git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.9-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 3 Nov 2020 14:07:10 +0000 (15:07 +0100)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 3 Nov 2020 14:07:10 +0000 (15:07 +0100)
added patches:
btrfs-cleanup-cow-block-on-error.patch
btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch
btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch
btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch
btrfs-improve-device-scanning-messages.patch
btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch
btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch
btrfs-reschedule-if-necessary-when-logging-directory-items.patch
btrfs-reschedule-when-cloning-lots-of-extents.patch
btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch
btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch
btrfs-skip-devices-without-magic-signature-when-mounting.patch
btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch
btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch
btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch
btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch
btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch
pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch
scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch
scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch
scsi-qla2xxx-fix-mpi-reset-needed-message.patch
scsi-qla2xxx-fix-reset-of-mpi-firmware.patch

23 files changed:
queue-5.9/btrfs-cleanup-cow-block-on-error.patch [new file with mode: 0644]
queue-5.9/btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch [new file with mode: 0644]
queue-5.9/btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch [new file with mode: 0644]
queue-5.9/btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch [new file with mode: 0644]
queue-5.9/btrfs-improve-device-scanning-messages.patch [new file with mode: 0644]
queue-5.9/btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch [new file with mode: 0644]
queue-5.9/btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch [new file with mode: 0644]
queue-5.9/btrfs-reschedule-if-necessary-when-logging-directory-items.patch [new file with mode: 0644]
queue-5.9/btrfs-reschedule-when-cloning-lots-of-extents.patch [new file with mode: 0644]
queue-5.9/btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch [new file with mode: 0644]
queue-5.9/btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch [new file with mode: 0644]
queue-5.9/btrfs-skip-devices-without-magic-signature-when-mounting.patch [new file with mode: 0644]
queue-5.9/btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch [new file with mode: 0644]
queue-5.9/btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch [new file with mode: 0644]
queue-5.9/btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch [new file with mode: 0644]
queue-5.9/btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch [new file with mode: 0644]
queue-5.9/btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch [new file with mode: 0644]
queue-5.9/pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch [new file with mode: 0644]
queue-5.9/scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch [new file with mode: 0644]
queue-5.9/scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch [new file with mode: 0644]
queue-5.9/scsi-qla2xxx-fix-mpi-reset-needed-message.patch [new file with mode: 0644]
queue-5.9/scsi-qla2xxx-fix-reset-of-mpi-firmware.patch [new file with mode: 0644]
queue-5.9/series

diff --git a/queue-5.9/btrfs-cleanup-cow-block-on-error.patch b/queue-5.9/btrfs-cleanup-cow-block-on-error.patch
new file mode 100644 (file)
index 0000000..480b959
--- /dev/null
@@ -0,0 +1,135 @@
+From 572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Tue, 29 Sep 2020 08:53:54 -0400
+Subject: btrfs: cleanup cow block on error
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 572c83acdcdafeb04e70aa46be1fa539310be20c upstream.
+
+In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead
+to a system lockup. It gets stuck trying to write back inodes, and the
+write back thread was trying to lock an extent buffer:
+
+  $ cat /proc/2143497/stack
+  [<0>] __btrfs_tree_lock+0x108/0x250
+  [<0>] lock_extent_buffer_for_io+0x35e/0x3a0
+  [<0>] btree_write_cache_pages+0x15a/0x3b0
+  [<0>] do_writepages+0x28/0xb0
+  [<0>] __writeback_single_inode+0x54/0x5c0
+  [<0>] writeback_sb_inodes+0x1e8/0x510
+  [<0>] wb_writeback+0xcc/0x440
+  [<0>] wb_workfn+0xd7/0x650
+  [<0>] process_one_work+0x236/0x560
+  [<0>] worker_thread+0x55/0x3c0
+  [<0>] kthread+0x13a/0x150
+  [<0>] ret_from_fork+0x1f/0x30
+
+This is because we got an error while COWing a block, specifically here:
+
+        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+                if (ret) {
+                        btrfs_abort_transaction(trans, ret);
+                        return ret;
+                }
+        }
+
+  [16402.241552] BTRFS: Transaction aborted (error -2)
+  [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540
+  [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8
+  [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
+  [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540
+  [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282
+  [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388
+  [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380
+  [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000
+  [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0
+  [16402.255694] R13: ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000
+  [16402.256321] FS:  00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000
+  [16402.256973] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0
+  [16402.257867] Call Trace:
+  [16402.258072]  btrfs_cow_block+0x109/0x230
+  [16402.258356]  btrfs_search_slot+0x530/0x9d0
+  [16402.258655]  btrfs_lookup_file_extent+0x37/0x40
+  [16402.259155]  __btrfs_drop_extents+0x13c/0xd60
+  [16402.259628]  ? btrfs_block_rsv_migrate+0x4f/0xb0
+  [16402.259949]  btrfs_replace_file_extents+0x190/0x820
+  [16402.260873]  btrfs_clone+0x9ae/0xc00
+  [16402.261139]  btrfs_extent_same_range+0x66/0x90
+  [16402.261771]  btrfs_remap_file_range+0x353/0x3b1
+  [16402.262333]  vfs_dedupe_file_range_one.part.0+0xd5/0x140
+  [16402.262821]  vfs_dedupe_file_range+0x189/0x220
+  [16402.263150]  do_vfs_ioctl+0x552/0x700
+  [16402.263662]  __x64_sys_ioctl+0x62/0xb0
+  [16402.264023]  do_syscall_64+0x33/0x40
+  [16402.264364]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [16402.264862] RIP: 0033:0x7f542c7d15cb
+  [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+  [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb
+  [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003
+  [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64
+  [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002
+  [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80
+  [16402.271498] irq event stamp: 0
+  [16402.271846] hardirqs last  enabled at (0): [<0000000000000000>] 0x0
+  [16402.272497] hardirqs last disabled at (0): [<ffffffff910dbf59>] copy_process+0x6b9/0x1ba0
+  [16402.273343] softirqs last  enabled at (0): [<ffffffff910dbf59>] copy_process+0x6b9/0x1ba0
+  [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0
+  [16402.274338] ---[ end trace 737874a5a41a8236 ]---
+  [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+  [16402.276179] BTRFS info (device dm-9): forced readonly
+  [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry
+  [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+  [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry
+  [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30
+
+The problem here is that as soon as we allocate the new block it is
+locked and marked dirty in the btree inode.  This means that we could
+attempt to writeback this block and need to lock the extent buffer.
+However we're not unlocking it here and thus we deadlock.
+
+Fix this by unlocking the cow block if we have any errors inside of
+__btrfs_cow_block, and also free it so we do not leak it.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -1061,6 +1061,8 @@ static noinline int __btrfs_cow_block(st
+       ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+       if (ret) {
++              btrfs_tree_unlock(cow);
++              free_extent_buffer(cow);
+               btrfs_abort_transaction(trans, ret);
+               return ret;
+       }
+@@ -1068,6 +1070,8 @@ static noinline int __btrfs_cow_block(st
+       if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+               ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+               if (ret) {
++                      btrfs_tree_unlock(cow);
++                      free_extent_buffer(cow);
+                       btrfs_abort_transaction(trans, ret);
+                       return ret;
+               }
+@@ -1100,6 +1104,8 @@ static noinline int __btrfs_cow_block(st
+               if (last_ref) {
+                       ret = tree_mod_log_free_eb(buf);
+                       if (ret) {
++                              btrfs_tree_unlock(cow);
++                              free_extent_buffer(cow);
+                               btrfs_abort_transaction(trans, ret);
+                               return ret;
+                       }
diff --git a/queue-5.9/btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch b/queue-5.9/btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch
new file mode 100644 (file)
index 0000000..e0e0c0d
--- /dev/null
@@ -0,0 +1,221 @@
+From 7837fa88704a66257404bb14144c9e4ab631a28a Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 14 Oct 2020 17:00:51 -0400
+Subject: btrfs: drop the path before adding block group sysfs files
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 7837fa88704a66257404bb14144c9e4ab631a28a upstream.
+
+Dave reported a problem with my rwsem conversion patch where we got the
+following lockdep splat:
+
+  ======================================================
+  WARNING: possible circular locking dependency detected
+  5.9.0-default+ #1297 Not tainted
+  ------------------------------------------------------
+  kswapd0/76 is trying to acquire lock:
+  ffff9d5d25df2530 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+
+  but task is already holding lock:
+  ffffffffa40cbba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+
+  which lock already depends on the new lock.
+
+  the existing dependency chain (in reverse order) is:
+
+  -> #4 (fs_reclaim){+.+.}-{0:0}:
+        __lock_acquire+0x582/0xac0
+        lock_acquire+0xca/0x430
+        fs_reclaim_acquire.part.0+0x25/0x30
+        kmem_cache_alloc+0x30/0x9c0
+        alloc_inode+0x81/0x90
+        iget_locked+0xcd/0x1a0
+        kernfs_get_inode+0x1b/0x130
+        kernfs_get_tree+0x136/0x210
+        sysfs_get_tree+0x1a/0x50
+        vfs_get_tree+0x1d/0xb0
+        path_mount+0x70f/0xa80
+        do_mount+0x75/0x90
+        __x64_sys_mount+0x8e/0xd0
+        do_syscall_64+0x2d/0x70
+        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  -> #3 (kernfs_mutex){+.+.}-{3:3}:
+        __lock_acquire+0x582/0xac0
+        lock_acquire+0xca/0x430
+        __mutex_lock+0xa0/0xaf0
+        kernfs_add_one+0x23/0x150
+        kernfs_create_dir_ns+0x58/0x80
+        sysfs_create_dir_ns+0x70/0xd0
+        kobject_add_internal+0xbb/0x2d0
+        kobject_add+0x7a/0xd0
+        btrfs_sysfs_add_block_group_type+0x141/0x1d0 [btrfs]
+        btrfs_read_block_groups+0x1f1/0x8c0 [btrfs]
+        open_ctree+0x981/0x1108 [btrfs]
+        btrfs_mount_root.cold+0xe/0xb0 [btrfs]
+        legacy_get_tree+0x2d/0x60
+        vfs_get_tree+0x1d/0xb0
+        fc_mount+0xe/0x40
+        vfs_kern_mount.part.0+0x71/0x90
+        btrfs_mount+0x13b/0x3e0 [btrfs]
+        legacy_get_tree+0x2d/0x60
+        vfs_get_tree+0x1d/0xb0
+        path_mount+0x70f/0xa80
+        do_mount+0x75/0x90
+        __x64_sys_mount+0x8e/0xd0
+        do_syscall_64+0x2d/0x70
+        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  -> #2 (btrfs-extent-00){++++}-{3:3}:
+        __lock_acquire+0x582/0xac0
+        lock_acquire+0xca/0x430
+        down_read_nested+0x45/0x220
+        __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
+        __btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
+        btrfs_search_slot+0x6d4/0xfd0 [btrfs]
+        check_committed_ref+0x69/0x200 [btrfs]
+        btrfs_cross_ref_exist+0x65/0xb0 [btrfs]
+        run_delalloc_nocow+0x446/0x9b0 [btrfs]
+        btrfs_run_delalloc_range+0x61/0x6a0 [btrfs]
+        writepage_delalloc+0xae/0x160 [btrfs]
+        __extent_writepage+0x262/0x420 [btrfs]
+        extent_write_cache_pages+0x2b6/0x510 [btrfs]
+        extent_writepages+0x43/0x90 [btrfs]
+        do_writepages+0x40/0xe0
+        __writeback_single_inode+0x62/0x610
+        writeback_sb_inodes+0x20f/0x500
+        wb_writeback+0xef/0x4a0
+        wb_do_writeback+0x49/0x2e0
+        wb_workfn+0x81/0x340
+        process_one_work+0x233/0x5d0
+        worker_thread+0x50/0x3b0
+        kthread+0x137/0x150
+        ret_from_fork+0x1f/0x30
+
+  -> #1 (btrfs-fs-00){++++}-{3:3}:
+        __lock_acquire+0x582/0xac0
+        lock_acquire+0xca/0x430
+        down_read_nested+0x45/0x220
+        __btrfs_tree_read_lock+0x35/0x1c0 [btrfs]
+        __btrfs_read_lock_root_node+0x3a/0x50 [btrfs]
+        btrfs_search_slot+0x6d4/0xfd0 [btrfs]
+        btrfs_lookup_inode+0x3a/0xc0 [btrfs]
+        __btrfs_update_delayed_inode+0x93/0x2c0 [btrfs]
+        __btrfs_commit_inode_delayed_items+0x7de/0x850 [btrfs]
+        __btrfs_run_delayed_items+0x8e/0x140 [btrfs]
+        btrfs_commit_transaction+0x367/0xbc0 [btrfs]
+        btrfs_mksubvol+0x2db/0x470 [btrfs]
+        btrfs_mksnapshot+0x7b/0xb0 [btrfs]
+        __btrfs_ioctl_snap_create+0x16f/0x1a0 [btrfs]
+        btrfs_ioctl_snap_create_v2+0xb0/0xf0 [btrfs]
+        btrfs_ioctl+0xd0b/0x2690 [btrfs]
+        __x64_sys_ioctl+0x6f/0xa0
+        do_syscall_64+0x2d/0x70
+        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
+        check_prev_add+0x91/0xc60
+        validate_chain+0xa6e/0x2a20
+        __lock_acquire+0x582/0xac0
+        lock_acquire+0xca/0x430
+        __mutex_lock+0xa0/0xaf0
+        __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+        btrfs_evict_inode+0x3cc/0x560 [btrfs]
+        evict+0xd6/0x1c0
+        dispose_list+0x48/0x70
+        prune_icache_sb+0x54/0x80
+        super_cache_scan+0x121/0x1a0
+        do_shrink_slab+0x16d/0x3b0
+        shrink_slab+0xb1/0x2e0
+        shrink_node+0x230/0x6a0
+        balance_pgdat+0x325/0x750
+        kswapd+0x206/0x4d0
+        kthread+0x137/0x150
+        ret_from_fork+0x1f/0x30
+
+  other info that might help us debug this:
+
+  Chain exists of:
+    &delayed_node->mutex --> kernfs_mutex --> fs_reclaim
+
+   Possible unsafe locking scenario:
+
+        CPU0                    CPU1
+        ----                    ----
+    lock(fs_reclaim);
+                                lock(kernfs_mutex);
+                                lock(fs_reclaim);
+    lock(&delayed_node->mutex);
+
+   *** DEADLOCK ***
+
+  3 locks held by kswapd0/76:
+   #0: ffffffffa40cbba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+   #1: ffffffffa40b8b58 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0
+   #2: ffff9d5d322390e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50
+
+  stack backtrace:
+  CPU: 2 PID: 76 Comm: kswapd0 Not tainted 5.9.0-default+ #1297
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+  Call Trace:
+   dump_stack+0x77/0x97
+   check_noncircular+0xff/0x110
+   ? save_trace+0x50/0x470
+   check_prev_add+0x91/0xc60
+   validate_chain+0xa6e/0x2a20
+   ? save_trace+0x50/0x470
+   __lock_acquire+0x582/0xac0
+   lock_acquire+0xca/0x430
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+   __mutex_lock+0xa0/0xaf0
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+   ? __lock_acquire+0x582/0xac0
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+   ? btrfs_evict_inode+0x30b/0x560 [btrfs]
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+   __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs]
+   btrfs_evict_inode+0x3cc/0x560 [btrfs]
+   evict+0xd6/0x1c0
+   dispose_list+0x48/0x70
+   prune_icache_sb+0x54/0x80
+   super_cache_scan+0x121/0x1a0
+   do_shrink_slab+0x16d/0x3b0
+   shrink_slab+0xb1/0x2e0
+   shrink_node+0x230/0x6a0
+   balance_pgdat+0x325/0x750
+   kswapd+0x206/0x4d0
+   ? finish_wait+0x90/0x90
+   ? balance_pgdat+0x750/0x750
+   kthread+0x137/0x150
+   ? kthread_mod_delayed_work+0xc0/0xc0
+   ret_from_fork+0x1f/0x30
+
+This happens because we are still holding the path open when we start
+adding the sysfs files for the block groups, which creates a dependency
+on fs_reclaim via the tree lock.  Fix this by dropping the path before
+we start doing anything with sysfs.
+
+Reported-by: David Sterba <dsterba@suse.com>
+CC: stable@vger.kernel.org # 5.8+
+Reviewed-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/block-group.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -2034,6 +2034,7 @@ int btrfs_read_block_groups(struct btrfs
+               key.offset = 0;
+               btrfs_release_path(path);
+       }
++      btrfs_release_path(path);
+       rcu_read_lock();
+       list_for_each_entry_rcu(space_info, &info->space_info, list) {
diff --git a/queue-5.9/btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch b/queue-5.9/btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch
new file mode 100644 (file)
index 0000000..fdbd97a
--- /dev/null
@@ -0,0 +1,678 @@
+From 66d204a16c94f24ad08290a7663ab67e7fc04e82 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 12 Oct 2020 11:55:24 +0100
+Subject: btrfs: fix readahead hang and use-after-free after removing a device
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 66d204a16c94f24ad08290a7663ab67e7fc04e82 upstream.
+
+Very sporadically I had test case btrfs/069 from fstests hanging (for
+years, it is not a recent regression), with the following traces in
+dmesg/syslog:
+
+  [162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started
+  [162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0
+  [162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished
+  [162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds.
+  [162513.514318]       Not tainted 5.9.0-rc6-btrfs-next-69 #1
+  [162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [162513.514747] task:btrfs-transacti state:D stack:    0 pid:1356167 ppid:     2 flags:0x00004000
+  [162513.514751] Call Trace:
+  [162513.514761]  __schedule+0x5ce/0xd00
+  [162513.514765]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
+  [162513.514771]  schedule+0x46/0xf0
+  [162513.514844]  wait_current_trans+0xde/0x140 [btrfs]
+  [162513.514850]  ? finish_wait+0x90/0x90
+  [162513.514864]  start_transaction+0x37c/0x5f0 [btrfs]
+  [162513.514879]  transaction_kthread+0xa4/0x170 [btrfs]
+  [162513.514891]  ? btrfs_cleanup_transaction+0x660/0x660 [btrfs]
+  [162513.514894]  kthread+0x153/0x170
+  [162513.514897]  ? kthread_stop+0x2c0/0x2c0
+  [162513.514902]  ret_from_fork+0x22/0x30
+  [162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds.
+  [162513.515192]       Not tainted 5.9.0-rc6-btrfs-next-69 #1
+  [162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [162513.515680] task:fsstress        state:D stack:    0 pid:1356184 ppid:1356177 flags:0x00004000
+  [162513.515682] Call Trace:
+  [162513.515688]  __schedule+0x5ce/0xd00
+  [162513.515691]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
+  [162513.515697]  schedule+0x46/0xf0
+  [162513.515712]  wait_current_trans+0xde/0x140 [btrfs]
+  [162513.515716]  ? finish_wait+0x90/0x90
+  [162513.515729]  start_transaction+0x37c/0x5f0 [btrfs]
+  [162513.515743]  btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
+  [162513.515753]  btrfs_sync_fs+0x61/0x1c0 [btrfs]
+  [162513.515758]  ? __ia32_sys_fdatasync+0x20/0x20
+  [162513.515761]  iterate_supers+0x87/0xf0
+  [162513.515765]  ksys_sync+0x60/0xb0
+  [162513.515768]  __do_sys_sync+0xa/0x10
+  [162513.515771]  do_syscall_64+0x33/0x80
+  [162513.515774]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [162513.515781] RIP: 0033:0x7f5238f50bd7
+  [162513.515782] Code: Bad RIP value.
+  [162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
+  [162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
+  [162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a
+  [162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0
+  [162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a
+  [162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
+  [162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds.
+  [162513.516064]       Not tainted 5.9.0-rc6-btrfs-next-69 #1
+  [162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [162513.516617] task:fsstress        state:D stack:    0 pid:1356185 ppid:1356177 flags:0x00000000
+  [162513.516620] Call Trace:
+  [162513.516625]  __schedule+0x5ce/0xd00
+  [162513.516628]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
+  [162513.516634]  schedule+0x46/0xf0
+  [162513.516647]  wait_current_trans+0xde/0x140 [btrfs]
+  [162513.516650]  ? finish_wait+0x90/0x90
+  [162513.516662]  start_transaction+0x4d7/0x5f0 [btrfs]
+  [162513.516679]  btrfs_setxattr_trans+0x3c/0x100 [btrfs]
+  [162513.516686]  __vfs_setxattr+0x66/0x80
+  [162513.516691]  __vfs_setxattr_noperm+0x70/0x200
+  [162513.516697]  vfs_setxattr+0x6b/0x120
+  [162513.516703]  setxattr+0x125/0x240
+  [162513.516709]  ? lock_acquire+0xb1/0x480
+  [162513.516712]  ? mnt_want_write+0x20/0x50
+  [162513.516721]  ? rcu_read_lock_any_held+0x8e/0xb0
+  [162513.516723]  ? preempt_count_add+0x49/0xa0
+  [162513.516725]  ? __sb_start_write+0x19b/0x290
+  [162513.516727]  ? preempt_count_add+0x49/0xa0
+  [162513.516732]  path_setxattr+0xba/0xd0
+  [162513.516739]  __x64_sys_setxattr+0x27/0x30
+  [162513.516741]  do_syscall_64+0x33/0x80
+  [162513.516743]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [162513.516745] RIP: 0033:0x7f5238f56d5a
+  [162513.516746] Code: Bad RIP value.
+  [162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc
+  [162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a
+  [162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470
+  [162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700
+  [162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004
+  [162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0
+  [162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds.
+  [162513.517064]       Not tainted 5.9.0-rc6-btrfs-next-69 #1
+  [162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [162513.517763] task:fsstress        state:D stack:    0 pid:1356196 ppid:1356177 flags:0x00004000
+  [162513.517780] Call Trace:
+  [162513.517786]  __schedule+0x5ce/0xd00
+  [162513.517789]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
+  [162513.517796]  schedule+0x46/0xf0
+  [162513.517810]  wait_current_trans+0xde/0x140 [btrfs]
+  [162513.517814]  ? finish_wait+0x90/0x90
+  [162513.517829]  start_transaction+0x37c/0x5f0 [btrfs]
+  [162513.517845]  btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
+  [162513.517857]  btrfs_sync_fs+0x61/0x1c0 [btrfs]
+  [162513.517862]  ? __ia32_sys_fdatasync+0x20/0x20
+  [162513.517865]  iterate_supers+0x87/0xf0
+  [162513.517869]  ksys_sync+0x60/0xb0
+  [162513.517872]  __do_sys_sync+0xa/0x10
+  [162513.517875]  do_syscall_64+0x33/0x80
+  [162513.517878]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [162513.517881] RIP: 0033:0x7f5238f50bd7
+  [162513.517883] Code: Bad RIP value.
+  [162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
+  [162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
+  [162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053
+  [162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0
+  [162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053
+  [162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
+  [162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds.
+  [162513.518298]       Not tainted 5.9.0-rc6-btrfs-next-69 #1
+  [162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [162513.519157] task:fsstress        state:D stack:    0 pid:1356197 ppid:1356177 flags:0x00000000
+  [162513.519160] Call Trace:
+  [162513.519165]  __schedule+0x5ce/0xd00
+  [162513.519168]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
+  [162513.519174]  schedule+0x46/0xf0
+  [162513.519190]  wait_current_trans+0xde/0x140 [btrfs]
+  [162513.519193]  ? finish_wait+0x90/0x90
+  [162513.519206]  start_transaction+0x4d7/0x5f0 [btrfs]
+  [162513.519222]  btrfs_create+0x57/0x200 [btrfs]
+  [162513.519230]  lookup_open+0x522/0x650
+  [162513.519246]  path_openat+0x2b8/0xa50
+  [162513.519270]  do_filp_open+0x91/0x100
+  [162513.519275]  ? find_held_lock+0x32/0x90
+  [162513.519280]  ? lock_acquired+0x33b/0x470
+  [162513.519285]  ? do_raw_spin_unlock+0x4b/0xc0
+  [162513.519287]  ? _raw_spin_unlock+0x29/0x40
+  [162513.519295]  do_sys_openat2+0x20d/0x2d0
+  [162513.519300]  do_sys_open+0x44/0x80
+  [162513.519304]  do_syscall_64+0x33/0x80
+  [162513.519307]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [162513.519309] RIP: 0033:0x7f5238f4a903
+  [162513.519310] Code: Bad RIP value.
+  [162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
+  [162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903
+  [162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470
+  [162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002
+  [162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013
+  [162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620
+  [162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds.
+  [162513.519727]       Not tainted 5.9.0-rc6-btrfs-next-69 #1
+  [162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+  [162513.520508] task:btrfs           state:D stack:    0 pid:1356211 ppid:1356178 flags:0x00004002
+  [162513.520511] Call Trace:
+  [162513.520516]  __schedule+0x5ce/0xd00
+  [162513.520519]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
+  [162513.520525]  schedule+0x46/0xf0
+  [162513.520544]  btrfs_scrub_pause+0x11f/0x180 [btrfs]
+  [162513.520548]  ? finish_wait+0x90/0x90
+  [162513.520562]  btrfs_commit_transaction+0x45a/0xc30 [btrfs]
+  [162513.520574]  ? start_transaction+0xe0/0x5f0 [btrfs]
+  [162513.520596]  btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs]
+  [162513.520619]  btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs]
+  [162513.520639]  btrfs_ioctl+0x2a25/0x36f0 [btrfs]
+  [162513.520643]  ? do_sigaction+0xf3/0x240
+  [162513.520645]  ? find_held_lock+0x32/0x90
+  [162513.520648]  ? do_sigaction+0xf3/0x240
+  [162513.520651]  ? lock_acquired+0x33b/0x470
+  [162513.520655]  ? _raw_spin_unlock_irq+0x24/0x50
+  [162513.520657]  ? lockdep_hardirqs_on+0x7d/0x100
+  [162513.520660]  ? _raw_spin_unlock_irq+0x35/0x50
+  [162513.520662]  ? do_sigaction+0xf3/0x240
+  [162513.520671]  ? __x64_sys_ioctl+0x83/0xb0
+  [162513.520672]  __x64_sys_ioctl+0x83/0xb0
+  [162513.520677]  do_syscall_64+0x33/0x80
+  [162513.520679]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [162513.520681] RIP: 0033:0x7fc3cd307d87
+  [162513.520682] Code: Bad RIP value.
+  [162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
+  [162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87
+  [162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003
+  [162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
+  [162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003
+  [162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001
+  [162513.520703]
+                 Showing all locks held in the system:
+  [162513.520712] 1 lock held by khungtaskd/54:
+  [162513.520713]  #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197
+  [162513.520728] 1 lock held by in:imklog/596:
+  [162513.520729]  #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60
+  [162513.520782] 1 lock held by btrfs-transacti/1356167:
+  [162513.520784]  #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs]
+  [162513.520798] 1 lock held by btrfs/1356190:
+  [162513.520800]  #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60
+  [162513.520805] 1 lock held by fsstress/1356184:
+  [162513.520806]  #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
+  [162513.520811] 3 locks held by fsstress/1356185:
+  [162513.520812]  #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
+  [162513.520815]  #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120
+  [162513.520820]  #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
+  [162513.520833] 1 lock held by fsstress/1356196:
+  [162513.520834]  #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
+  [162513.520838] 3 locks held by fsstress/1356197:
+  [162513.520839]  #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
+  [162513.520843]  #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50
+  [162513.520846]  #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
+  [162513.520858] 2 locks held by btrfs/1356211:
+  [162513.520859]  #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs]
+  [162513.520877]  #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
+
+This was weird because the stack traces show that a transaction commit,
+triggered by a device replace operation, is blocking trying to pause any
+running scrubs but there are no stack traces of blocked tasks doing a
+scrub.
+
+After poking around with drgn, I noticed there was a scrub task that was
+constantly running and blocking for short periods of time:
+
+  >>> t = find_task(prog, 1356190)
+  >>> prog.stack_trace(t)
+  #0  __schedule+0x5ce/0xcfc
+  #1  schedule+0x46/0xe4
+  #2  schedule_timeout+0x1df/0x475
+  #3  btrfs_reada_wait+0xda/0x132
+  #4  scrub_stripe+0x2a8/0x112f
+  #5  scrub_chunk+0xcd/0x134
+  #6  scrub_enumerate_chunks+0x29e/0x5ee
+  #7  btrfs_scrub_dev+0x2d5/0x91b
+  #8  btrfs_ioctl+0x7f5/0x36e7
+  #9  __x64_sys_ioctl+0x83/0xb0
+  #10 do_syscall_64+0x33/0x77
+  #11 entry_SYSCALL_64+0x7c/0x156
+
+Which corresponds to:
+
+int btrfs_reada_wait(void *handle)
+{
+    struct reada_control *rc = handle;
+    struct btrfs_fs_info *fs_info = rc->fs_info;
+
+    while (atomic_read(&rc->elems)) {
+        if (!atomic_read(&fs_info->reada_works_cnt))
+            reada_start_machine(fs_info);
+        wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+                          (HZ + 9) / 10);
+    }
+(...)
+
+So the counter "rc->elems" was set to 1 and never decreased to 0, causing
+the scrub task to loop forever in that function. Then I used the following
+script for drgn to check the readahead requests:
+
+  $ cat dump_reada.py
+  import sys
+  import drgn
+  from drgn import NULL, Object, cast, container_of, execscript, \
+      reinterpret, sizeof
+  from drgn.helpers.linux import *
+
+  mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
+
+  mnt = None
+  for mnt in for_each_mount(prog, dst = mnt_path):
+      pass
+
+  if mnt is None:
+      sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
+      sys.exit(1)
+
+  fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
+
+  def dump_re(re):
+      nzones = re.nzones.value_()
+      print(f're at {hex(re.value_())}')
+      print(f'\t logical {re.logical.value_()}')
+      print(f'\t refcnt {re.refcnt.value_()}')
+      print(f'\t nzones {nzones}')
+      for i in range(nzones):
+          dev = re.zones[i].device
+          name = dev.name.str.string_()
+          print(f'\t\t dev id {dev.devid.value_()} name {name}')
+      print()
+
+  for _, e in radix_tree_for_each(fs_info.reada_tree):
+      re = cast('struct reada_extent *', e)
+      dump_re(re)
+
+  $ drgn dump_reada.py
+  re at 0xffff8f3da9d25ad8
+          logical 38928384
+          refcnt 1
+          nzones 1
+                 dev id 0 name b'/dev/sdd'
+  $
+
+So there was one readahead extent with a single zone corresponding to the
+source device of that last device replace operation logged in dmesg/syslog.
+Also the ID of that zone's device was 0 which is a special value set in
+the source device of a device replace operation when the operation finishes
+(constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()),
+confirming again that device /dev/sdd was the source of a device replace
+operation.
+
+Normally there should be as many zones in the readahead extent as there are
+devices, and I wasn't expecting the extent to be in a block group with a
+'single' profile, so I went and confirmed with the following drgn script
+that there weren't any single profile block groups:
+
+  $ cat dump_block_groups.py
+  import sys
+  import drgn
+  from drgn import NULL, Object, cast, container_of, execscript, \
+      reinterpret, sizeof
+  from drgn.helpers.linux import *
+
+  mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
+
+  mnt = None
+  for mnt in for_each_mount(prog, dst = mnt_path):
+      pass
+
+  if mnt is None:
+      sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
+      sys.exit(1)
+
+  fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
+
+  BTRFS_BLOCK_GROUP_DATA = (1 << 0)
+  BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1)
+  BTRFS_BLOCK_GROUP_METADATA = (1 << 2)
+  BTRFS_BLOCK_GROUP_RAID0 = (1 << 3)
+  BTRFS_BLOCK_GROUP_RAID1 = (1 << 4)
+  BTRFS_BLOCK_GROUP_DUP = (1 << 5)
+  BTRFS_BLOCK_GROUP_RAID10 = (1 << 6)
+  BTRFS_BLOCK_GROUP_RAID5 = (1 << 7)
+  BTRFS_BLOCK_GROUP_RAID6 = (1 << 8)
+  BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9)
+  BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10)
+
+  def bg_flags_string(bg):
+      flags = bg.flags.value_()
+      ret = ''
+      if flags & BTRFS_BLOCK_GROUP_DATA:
+          ret = 'data'
+      if flags & BTRFS_BLOCK_GROUP_METADATA:
+          if len(ret) > 0:
+              ret += '|'
+          ret += 'meta'
+      if flags & BTRFS_BLOCK_GROUP_SYSTEM:
+          if len(ret) > 0:
+              ret += '|'
+          ret += 'system'
+      if flags & BTRFS_BLOCK_GROUP_RAID0:
+          ret += ' raid0'
+      elif flags & BTRFS_BLOCK_GROUP_RAID1:
+          ret += ' raid1'
+      elif flags & BTRFS_BLOCK_GROUP_DUP:
+          ret += ' dup'
+      elif flags & BTRFS_BLOCK_GROUP_RAID10:
+          ret += ' raid10'
+      elif flags & BTRFS_BLOCK_GROUP_RAID5:
+          ret += ' raid5'
+      elif flags & BTRFS_BLOCK_GROUP_RAID6:
+          ret += ' raid6'
+      elif flags & BTRFS_BLOCK_GROUP_RAID1C3:
+          ret += ' raid1c3'
+      elif flags & BTRFS_BLOCK_GROUP_RAID1C4:
+          ret += ' raid1c4'
+      else:
+          ret += ' single'
+
+      return ret
+
+  def dump_bg(bg):
+      print()
+      print(f'block group at {hex(bg.value_())}')
+      print(f'\t start {bg.start.value_()} length {bg.length.value_()}')
+      print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}')
+
+  bg_root = fs_info.block_group_cache_tree.address_of_()
+  for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'):
+      dump_bg(bg)
+
+  $ drgn dump_block_groups.py
+
+  block group at 0xffff8f3d673b0400
+         start 22020096 length 16777216
+         flags 258 - system raid6
+
+  block group at 0xffff8f3d53ddb400
+         start 38797312 length 536870912
+         flags 260 - meta raid6
+
+  block group at 0xffff8f3d5f4d9c00
+         start 575668224 length 2147483648
+         flags 257 - data raid6
+
+  block group at 0xffff8f3d08189000
+         start 2723151872 length 67108864
+         flags 258 - system raid6
+
+  block group at 0xffff8f3db70ff000
+         start 2790260736 length 1073741824
+         flags 260 - meta raid6
+
+  block group at 0xffff8f3d5f4dd800
+         start 3864002560 length 67108864
+         flags 258 - system raid6
+
+  block group at 0xffff8f3d67037000
+         start 3931111424 length 2147483648
+         flags 257 - data raid6
+  $
+
+So there were only 2 reasons left for having a readahead extent with a
+single zone: reada_find_zone(), called when creating a readahead extent,
+returned NULL either because we failed to find the corresponding block
+group or because a memory allocation failed. With some additional and
+custom tracing I figured out that on every further occurrence of the
+problem the block group had just been deleted when we were looping to
+create the zones for the readahead extent (at reada_find_extent()), so we
+ended up with only one zone in the readahead extent, corresponding to a
+device that ends up getting replaced.
+
+So after figuring that out it became obvious why the hang happens:
+
+1) Task A starts a scrub on any device of the filesystem, except for
+   device /dev/sdd;
+
+2) Task B starts a device replace with /dev/sdd as the source device;
+
+3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently
+   starting to scrub a stripe from block group X. This call to
+   btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add()
+   calls reada_add_block(), it passes the logical address of the extent
+   tree's root node as its 'logical' argument - a value of 38928384;
+
+4) Task A then enters reada_find_extent(), called from reada_add_block().
+   It finds there isn't any existing readahead extent for the logical
+   address 38928384, so it proceeds to the path of creating a new one.
+
+   It calls btrfs_map_block() to find out which stripes exist for the block
+   group X. On the first iteration of the for loop that iterates over the
+   stripes, it finds the stripe for device /dev/sdd, so it creates one
+   zone for that device and adds it to the readahead extent. Before getting
+   into the second iteration of the loop, the cleanup kthread deletes block
+   group X because it was empty. So in the iterations for the remaining
+   stripes it does not add more zones to the readahead extent, because the
+   calls to reada_find_zone() returned NULL because they couldn't find
+   block group X anymore.
+
+   As a result the new readahead extent has a single zone, corresponding to
+   the device /dev/sdd;
+
+5) Before task A returns to btrfs_reada_add() and queues the readahead job
+   for the readahead work queue, task B finishes the device replace and at
+   btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new
+   device /dev/sdg;
+
+6) Task A returns to reada_add_block(), which increments the counter
+   "->elems" of the reada_control structure allocated at btrfs_reada_add().
+
+   Then it returns back to btrfs_reada_add() and calls
+   reada_start_machine(). This queues a job in the readahead work queue to
+   run the function reada_start_machine_worker(), which calls
+   __reada_start_machine().
+
+   At __reada_start_machine() we take the device list mutex and for each
+   device found in the current device list, we call
+   reada_start_machine_dev() to start the readahead work. However at this
+   point the device /dev/sdd was already freed and is not in the device
+   list anymore.
+
+   This means the corresponding readahead for the extent at 38928384 is
+   never started, and therefore the "->elems" counter of the reada_control
+   structure allocated at btrfs_reada_add() never goes down to 0, causing
+   the call to btrfs_reada_wait(), done by the scrub task, to wait forever.
+
+Note that the readahead request can be made either after the device replace
+started or before it started, however in practice it is very unlikely that a
+device replace is able to start after a readahead request is made and is
+able to complete before the readahead request completes - maybe only on a
+very small and nearly empty filesystem.
+
+This hang however is not the only problem we can have with readahead and
+device removals. When the readahead extent has zones other than the
+one corresponding to the device that is being removed (either by a device
+replace or a device remove operation), we risk having a use-after-free on
+the device when dropping the last reference of the readahead extent.
+
+For example if we create a readahead extent with two zones, one for the
+device /dev/sdd and one for the device /dev/sde:
+
+1) Before the readahead worker starts, the device /dev/sdd is removed,
+   and the corresponding btrfs_device structure is freed. However the
+   readahead extent still has the zone pointing to the device structure;
+
+2) When the readahead worker starts, it only finds device /dev/sde in the
+   current device list of the filesystem;
+
+3) It starts the readahead work, at reada_start_machine_dev(), using the
+   device /dev/sde;
+
+4) Then when it finishes reading the extent from device /dev/sde, it calls
+   __readahead_hook() which ends up dropping the last reference on the
+   readahead extent through the last call to reada_extent_put();
+
+5) At reada_extent_put() it iterates over each zone of the readahead extent
+   and attempts to delete an element from the device's 'reada_extents'
+   radix tree, resulting in a use-after-free, as the device pointer of the
+   zone for /dev/sdd is now stale. We can also access the device after
+   dropping the last reference of a zone, through reada_zone_release(),
+   also called by reada_extent_put().
+
+And a device remove suffers the same problem, however since it shrinks the
+device size down to zero before removing the device, it is very unlikely to
+still have readahead requests not completed by the time we free the device,
+the only possibility is if the device has very little space allocated.
+
+While the hang problem is exclusive to scrub, since it is currently the
+only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free
+problem affects any path that triggers readahead, which includes
+btree_readahead_hook() and __readahead_hook() (a readahead worker can
+trigger readahead for the children of a node) for example - any path that
+ends up calling reada_add_block() can trigger the use-after-free after a
+device is removed.
+
+So fix this by waiting for any readahead requests for a device to complete
+before removing a device, ensuring that while waiting for existing ones no
+new ones can be made.
+
+This problem has been around for a very long time - the readahead code was
+added in 2011, device remove exists since 2008 and device replace was
+introduced in 2013, hard to pick a specific commit for a git Fixes tag.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h       |    2 ++
+ fs/btrfs/dev-replace.c |    5 +++++
+ fs/btrfs/reada.c       |   45 +++++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/volumes.c     |    3 +++
+ fs/btrfs/volumes.h     |    1 +
+ 5 files changed, 56 insertions(+)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -3517,6 +3517,8 @@ struct reada_control *btrfs_reada_add(st
+ int btrfs_reada_wait(void *handle);
+ void btrfs_reada_detach(void *handle);
+ int btree_readahead_hook(struct extent_buffer *eb, int err);
++void btrfs_reada_remove_dev(struct btrfs_device *dev);
++void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
+ static inline int is_fstree(u64 rootid)
+ {
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -668,6 +668,9 @@ static int btrfs_dev_replace_finishing(s
+       }
+       btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
++      if (!scrub_ret)
++              btrfs_reada_remove_dev(src_device);
++
+       /*
+        * We have to use this loop approach because at this point src_device
+        * has to be available for transaction commit to complete, yet new
+@@ -676,6 +679,7 @@ static int btrfs_dev_replace_finishing(s
+       while (1) {
+               trans = btrfs_start_transaction(root, 0);
+               if (IS_ERR(trans)) {
++                      btrfs_reada_undo_remove_dev(src_device);
+                       mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+                       return PTR_ERR(trans);
+               }
+@@ -726,6 +730,7 @@ error:
+               up_write(&dev_replace->rwsem);
+               mutex_unlock(&fs_info->chunk_mutex);
+               mutex_unlock(&fs_info->fs_devices->device_list_mutex);
++              btrfs_reada_undo_remove_dev(src_device);
+               btrfs_rm_dev_replace_blocked(fs_info);
+               if (tgt_device)
+                       btrfs_destroy_dev_replace_tgtdev(tgt_device);
+--- a/fs/btrfs/reada.c
++++ b/fs/btrfs/reada.c
+@@ -421,6 +421,9 @@ static struct reada_extent *reada_find_e
+               if (!dev->bdev)
+                       continue;
++              if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state))
++                      continue;
++
+               if (dev_replace_is_ongoing &&
+                   dev == fs_info->dev_replace.tgtdev) {
+                       /*
+@@ -1014,3 +1017,45 @@ void btrfs_reada_detach(void *handle)
+       kref_put(&rc->refcnt, reada_control_release);
+ }
++
++/*
++ * Before removing a device (device replace or device remove ioctls), call this
++ * function to wait for all existing readahead requests on the device and to
++ * make sure no one queues more readahead requests for the device.
++ *
++ * Must be called without holding either the device list mutex or the device
++ * replace semaphore, otherwise it will deadlock.
++ */
++void btrfs_reada_remove_dev(struct btrfs_device *dev)
++{
++      struct btrfs_fs_info *fs_info = dev->fs_info;
++
++      /* Serialize with readahead extent creation at reada_find_extent(). */
++      spin_lock(&fs_info->reada_lock);
++      set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
++      spin_unlock(&fs_info->reada_lock);
++
++      /*
++       * There might be readahead requests added to the radix trees which
++       * were not yet added to the readahead work queue. We need to start
++       * them and wait for their completion, otherwise we can end up with
++       * use-after-free problems when dropping the last reference on the
++       * readahead extents and their zones, as they need to access the
++       * device structure.
++       */
++      reada_start_machine(fs_info);
++      btrfs_flush_workqueue(fs_info->readahead_workers);
++}
++
++/*
++ * If when removing a device (device replace or device remove ioctls) an error
++ * happens after calling btrfs_reada_remove_dev(), call this to undo what that
++ * function did. This is safe to call even if btrfs_reada_remove_dev() was not
++ * called before.
++ */
++void btrfs_reada_undo_remove_dev(struct btrfs_device *dev)
++{
++      spin_lock(&dev->fs_info->reada_lock);
++      clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state);
++      spin_unlock(&dev->fs_info->reada_lock);
++}
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2104,6 +2104,8 @@ int btrfs_rm_device(struct btrfs_fs_info
+       mutex_unlock(&uuid_mutex);
+       ret = btrfs_shrink_device(device, 0);
++      if (!ret)
++              btrfs_reada_remove_dev(device);
+       mutex_lock(&uuid_mutex);
+       if (ret)
+               goto error_undo;
+@@ -2191,6 +2193,7 @@ out:
+       return ret;
+ error_undo:
++      btrfs_reada_undo_remove_dev(device);
+       if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
+               mutex_lock(&fs_info->chunk_mutex);
+               list_add(&device->dev_alloc_list,
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -50,6 +50,7 @@ struct btrfs_io_geometry {
+ #define BTRFS_DEV_STATE_MISSING               (2)
+ #define BTRFS_DEV_STATE_REPLACE_TGT   (3)
+ #define BTRFS_DEV_STATE_FLUSH_SENT    (4)
++#define BTRFS_DEV_STATE_NO_READA      (5)
+ struct btrfs_device {
+       struct list_head dev_list; /* device_list_mutex */
diff --git a/queue-5.9/btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch b/queue-5.9/btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch
new file mode 100644 (file)
index 0000000..3435a0d
--- /dev/null
@@ -0,0 +1,139 @@
+From 83bc1560e02e25c6439341352024ebe8488f4fbd Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 12 Oct 2020 11:55:23 +0100
+Subject: btrfs: fix use-after-free on readahead extent after failure to create it
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 83bc1560e02e25c6439341352024ebe8488f4fbd upstream.
+
+If we fail to find suitable zones for a new readahead extent, we end up
+leaving a stale pointer in the global readahead extents radix tree
+(fs_info->reada_tree), which can trigger the following trace later on:
+
+  [13367.696354] BUG: kernel NULL pointer dereference, address: 00000000000000b0
+  [13367.696802] #PF: supervisor read access in kernel mode
+  [13367.697249] #PF: error_code(0x0000) - not-present page
+  [13367.697721] PGD 0 P4D 0
+  [13367.698171] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
+  [13367.698632] CPU: 6 PID: 851214 Comm: btrfs Tainted: G        W         5.9.0-rc6-btrfs-next-69 #1
+  [13367.699100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+  [13367.700069] RIP: 0010:__lock_acquire+0x20a/0x3970
+  [13367.700562] Code: ff 1f 0f b7 c0 48 0f (...)
+  [13367.701609] RSP: 0018:ffffb14448f57790 EFLAGS: 00010046
+  [13367.702140] RAX: 0000000000000000 RBX: 29b935140c15e8cf RCX: 0000000000000000
+  [13367.702698] RDX: 0000000000000002 RSI: ffffffffb3d66bd0 RDI: 0000000000000046
+  [13367.703240] RBP: ffff8a52ba8ac040 R08: 00000c2866ad9288 R09: 0000000000000001
+  [13367.703783] R10: 0000000000000001 R11: 00000000b66d9b53 R12: ffff8a52ba8ac9b0
+  [13367.704330] R13: 0000000000000000 R14: ffff8a532b6333e8 R15: 0000000000000000
+  [13367.704880] FS:  00007fe1df6b5700(0000) GS:ffff8a5376600000(0000) knlGS:0000000000000000
+  [13367.705438] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  [13367.705995] CR2: 00000000000000b0 CR3: 000000022cca8004 CR4: 00000000003706e0
+  [13367.706565] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  [13367.707127] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  [13367.707686] Call Trace:
+  [13367.708246]  ? ___slab_alloc+0x395/0x740
+  [13367.708820]  ? reada_add_block+0xae/0xee0 [btrfs]
+  [13367.709383]  lock_acquire+0xb1/0x480
+  [13367.709955]  ? reada_add_block+0xe0/0xee0 [btrfs]
+  [13367.710537]  ? reada_add_block+0xae/0xee0 [btrfs]
+  [13367.711097]  ? rcu_read_lock_sched_held+0x5d/0x90
+  [13367.711659]  ? kmem_cache_alloc_trace+0x8d2/0x990
+  [13367.712221]  ? lock_acquired+0x33b/0x470
+  [13367.712784]  _raw_spin_lock+0x34/0x80
+  [13367.713356]  ? reada_add_block+0xe0/0xee0 [btrfs]
+  [13367.713966]  reada_add_block+0xe0/0xee0 [btrfs]
+  [13367.714529]  ? btrfs_root_node+0x15/0x1f0 [btrfs]
+  [13367.715077]  btrfs_reada_add+0x117/0x170 [btrfs]
+  [13367.715620]  scrub_stripe+0x21e/0x10d0 [btrfs]
+  [13367.716141]  ? kvm_sched_clock_read+0x5/0x10
+  [13367.716657]  ? __lock_acquire+0x41e/0x3970
+  [13367.717184]  ? scrub_chunk+0x60/0x140 [btrfs]
+  [13367.717697]  ? find_held_lock+0x32/0x90
+  [13367.718254]  ? scrub_chunk+0x60/0x140 [btrfs]
+  [13367.718773]  ? lock_acquired+0x33b/0x470
+  [13367.719278]  ? scrub_chunk+0xcd/0x140 [btrfs]
+  [13367.719786]  scrub_chunk+0xcd/0x140 [btrfs]
+  [13367.720291]  scrub_enumerate_chunks+0x270/0x5c0 [btrfs]
+  [13367.720787]  ? finish_wait+0x90/0x90
+  [13367.721281]  btrfs_scrub_dev+0x1ee/0x620 [btrfs]
+  [13367.721762]  ? rcu_read_lock_any_held+0x8e/0xb0
+  [13367.722235]  ? preempt_count_add+0x49/0xa0
+  [13367.722710]  ? __sb_start_write+0x19b/0x290
+  [13367.723192]  btrfs_ioctl+0x7f5/0x36f0 [btrfs]
+  [13367.723660]  ? __fget_files+0x101/0x1d0
+  [13367.724118]  ? find_held_lock+0x32/0x90
+  [13367.724559]  ? __fget_files+0x101/0x1d0
+  [13367.724982]  ? __x64_sys_ioctl+0x83/0xb0
+  [13367.725399]  __x64_sys_ioctl+0x83/0xb0
+  [13367.725802]  do_syscall_64+0x33/0x80
+  [13367.726188]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [13367.726574] RIP: 0033:0x7fe1df7add87
+  [13367.726948] Code: 00 00 00 48 8b 05 09 91 (...)
+  [13367.727763] RSP: 002b:00007fe1df6b4d48 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+  [13367.728179] RAX: ffffffffffffffda RBX: 000055ce1fb596a0 RCX: 00007fe1df7add87
+  [13367.728604] RDX: 000055ce1fb596a0 RSI: 00000000c400941b RDI: 0000000000000003
+  [13367.729021] RBP: 0000000000000000 R08: 00007fe1df6b5700 R09: 0000000000000000
+  [13367.729431] R10: 00007fe1df6b5700 R11: 0000000000000246 R12: 00007ffd922b07de
+  [13367.729842] R13: 00007ffd922b07df R14: 00007fe1df6b4e40 R15: 0000000000802000
+  [13367.730275] Modules linked in: btrfs blake2b_generic xor (...)
+  [13367.732638] CR2: 00000000000000b0
+  [13367.733166] ---[ end trace d298b6805556acd9 ]---
+
+What happens is the following:
+
+1) At reada_find_extent() we don't find any existing readahead extent for
+   the metadata extent starting at logical address X;
+
+2) So we proceed to create a new one. We then call btrfs_map_block() to get
+   information about which stripes contain extent X;
+
+3) After that we iterate over the stripes and create only one zone for the
+   readahead extent - only one because reada_find_zone() returned NULL for
+   all iterations except for one, either because a memory allocation failed
+   or it couldn't find the block group of the extent (it may have just been
+   deleted);
+
+4) We then add the new readahead extent to the readahead extents radix
+   tree at fs_info->reada_tree;
+
+5) Then we iterate over each zone of the new readahead extent, and find
+   that the device used for that zone no longer exists, because it was
+   removed or it was the source device of a device replace operation.
+   Since this left 'have_zone' set to 0, after finishing the loop we jump
+   to the 'error' label, call kfree() on the new readahead extent and
+   return without removing it from the radix tree at fs_info->reada_tree;
+
+6) Any future call to reada_find_extent() for the logical address X will
+   find the stale pointer in the readahead extents radix tree, increment
+   its reference counter, which can trigger the use-after-free right
+   away or return it to the caller reada_add_block() that results in the
+   use-after-free of the example trace above.
+
+So fix this by making sure we delete the readahead extent from the radix
+tree if we fail to setup zones for it (when 'have_zone = 0').
+
+Fixes: 319450211842ba ("btrfs: reada: bypass adding extent when all zone failed")
+CC: stable@vger.kernel.org # 4.9+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/reada.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/reada.c
++++ b/fs/btrfs/reada.c
+@@ -445,6 +445,8 @@ static struct reada_extent *reada_find_e
+               }
+               have_zone = 1;
+       }
++      if (!have_zone)
++              radix_tree_delete(&fs_info->reada_tree, index);
+       spin_unlock(&fs_info->reada_lock);
+       up_read(&fs_info->dev_replace.rwsem);
diff --git a/queue-5.9/btrfs-improve-device-scanning-messages.patch b/queue-5.9/btrfs-improve-device-scanning-messages.patch
new file mode 100644 (file)
index 0000000..0fb399d
--- /dev/null
@@ -0,0 +1,59 @@
+From 79dae17d8d44b2d15779e332180080af45df5352 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Thu, 3 Sep 2020 21:30:12 +0800
+Subject: btrfs: improve device scanning messages
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 79dae17d8d44b2d15779e332180080af45df5352 upstream.
+
+Systems booting without the initramfs seem to scan an unusual kind
+of device path (/dev/root). And at a later time, the device is updated
+to the correct path. We generally print the process name and PID of the
+process scanning the device but we don't capture the same information if
+the device path is rescanned with a different pathname.
+
+The current message is too long, so drop the unnecessary UUID and add
+process name and PID.
+
+While at it, also update the duplicate device warning to include the
+process name and PID so the messages are consistent.
+
+CC: stable@vger.kernel.org # 4.19+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=89721
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -942,16 +942,18 @@ static noinline struct btrfs_device *dev
+                               bdput(path_bdev);
+                               mutex_unlock(&fs_devices->device_list_mutex);
+                               btrfs_warn_in_rcu(device->fs_info,
+-                      "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
+-                                      disk_super->fsid, devid,
+-                                      rcu_str_deref(device->name), path);
++      "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
++                                                path, devid, found_transid,
++                                                current->comm,
++                                                task_pid_nr(current));
+                               return ERR_PTR(-EEXIST);
+                       }
+                       bdput(path_bdev);
+                       btrfs_info_in_rcu(device->fs_info,
+-                              "device fsid %pU devid %llu moved old:%s new:%s",
+-                              disk_super->fsid, devid,
+-                              rcu_str_deref(device->name), path);
++      "devid %llu device path %s changed to %s scanned by %s (%d)",
++                                        devid, rcu_str_deref(device->name),
++                                        path, current->comm,
++                                        task_pid_nr(current));
+               }
+               name = rcu_string_strdup(path, GFP_NOFS);
diff --git a/queue-5.9/btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch b/queue-5.9/btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch
new file mode 100644 (file)
index 0000000..f30d098
--- /dev/null
@@ -0,0 +1,155 @@
+From e85fde5162bf1b242cbd6daf7dba0f9b457d592b Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 24 Jul 2020 14:46:10 +0800
+Subject: btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit e85fde5162bf1b242cbd6daf7dba0f9b457d592b upstream.
+
+[BUG]
+When quota is enabled for TEST_DEV, generic/013 sometimes fails like this:
+
+  generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg)
+
+And with the following metadata leak:
+
+  BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152
+  ------------[ cut here ]------------
+  WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs]
+  Call Trace:
+   btrfs_put_super+0x15/0x17 [btrfs]
+   generic_shutdown_super+0x72/0x110
+   kill_anon_super+0x18/0x30
+   btrfs_kill_super+0x17/0x30 [btrfs]
+   deactivate_locked_super+0x3b/0xa0
+   deactivate_super+0x40/0x50
+   cleanup_mnt+0x135/0x190
+   __cleanup_mnt+0x12/0x20
+   task_work_run+0x64/0xb0
+   __prepare_exit_to_usermode+0x1bc/0x1c0
+   __syscall_return_slowpath+0x47/0x230
+   do_syscall_64+0x64/0xb0
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  ---[ end trace a6cfd45ba80e4e06 ]---
+  BTRFS error (device dm-3): qgroup reserved space leaked
+  BTRFS info (device dm-3): disk space caching is enabled
+  BTRFS info (device dm-3): has skinny extents
+
+[CAUSE]
+The qgroup preallocated meta rsv operations of that offending root are:
+
+  btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
+  btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072
+  btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152
+  btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
+  btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072
+
+It's pretty obvious that we reserve qgroup meta rsv in
+btrfs_subvolume_reserve_metadata(), but don't have corresponding
+release/convert calls in btrfs_subvolume_release_metadata().
+
+This leads to the leakage.
+
+[FIX]
+To fix this bug, we should follow what we're doing in
+btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and
+add it to block_rsv->qgroup_rsv_reserved.
+
+And free the qgroup reserved metadata space when releasing the
+block_rsv.
+
+To do this, we need to change the btrfs_subvolume_release_metadata() to
+accept btrfs_root, and record the qgroup_to_release number, and call
+btrfs_qgroup_convert_reserved_meta() for it.
+
+Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans")
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ctree.h     |    2 +-
+ fs/btrfs/inode.c     |    2 +-
+ fs/btrfs/ioctl.c     |    6 +++---
+ fs/btrfs/root-tree.c |   13 +++++++++++--
+ 4 files changed, 16 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -2619,7 +2619,7 @@ enum btrfs_flush_state {
+ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+                                    struct btrfs_block_rsv *rsv,
+                                    int nitems, bool use_global_rsv);
+-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
++void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+                                     struct btrfs_block_rsv *rsv);
+ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4051,7 +4051,7 @@ out_end_trans:
+               err = ret;
+       inode->i_flags |= S_DEAD;
+ out_release:
+-      btrfs_subvolume_release_metadata(fs_info, &block_rsv);
++      btrfs_subvolume_release_metadata(root, &block_rsv);
+ out_up_write:
+       up_write(&fs_info->subvol_sem);
+       if (err) {
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -618,7 +618,7 @@ static noinline int create_subvol(struct
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans)) {
+               ret = PTR_ERR(trans);
+-              btrfs_subvolume_release_metadata(fs_info, &block_rsv);
++              btrfs_subvolume_release_metadata(root, &block_rsv);
+               goto fail_free;
+       }
+       trans->block_rsv = &block_rsv;
+@@ -742,7 +742,7 @@ fail:
+       kfree(root_item);
+       trans->block_rsv = NULL;
+       trans->bytes_reserved = 0;
+-      btrfs_subvolume_release_metadata(fs_info, &block_rsv);
++      btrfs_subvolume_release_metadata(root, &block_rsv);
+       err = btrfs_commit_transaction(trans);
+       if (err && !ret)
+@@ -856,7 +856,7 @@ fail:
+       if (ret && pending_snapshot->snap)
+               pending_snapshot->snap->anon_dev = 0;
+       btrfs_put_root(pending_snapshot->snap);
+-      btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
++      btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
+ free_pending:
+       if (pending_snapshot->anon_dev)
+               free_anon_bdev(pending_snapshot->anon_dev);
+--- a/fs/btrfs/root-tree.c
++++ b/fs/btrfs/root-tree.c
+@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(str
+       if (ret && qgroup_num_bytes)
+               btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
++      if (!ret) {
++              spin_lock(&rsv->lock);
++              rsv->qgroup_rsv_reserved += qgroup_num_bytes;
++              spin_unlock(&rsv->lock);
++      }
+       return ret;
+ }
+-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
++void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+                                     struct btrfs_block_rsv *rsv)
+ {
+-      btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
++      struct btrfs_fs_info *fs_info = root->fs_info;
++      u64 qgroup_to_release;
++
++      btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release);
++      btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release);
+ }
diff --git a/queue-5.9/btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch b/queue-5.9/btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch
new file mode 100644 (file)
index 0000000..9bbba89
--- /dev/null
@@ -0,0 +1,66 @@
+From b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 24 Jul 2020 14:46:09 +0800
+Subject: btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c upstream.
+
+For delayed inode facility, qgroup metadata is reserved for it, and
+later freed.
+
+However we're freeing more bytes than we reserved.
+In btrfs_delayed_inode_reserve_metadata():
+
+       num_bytes = btrfs_calc_metadata_size(fs_info, 1);
+       ...
+               ret = btrfs_qgroup_reserve_meta_prealloc(root,
+                               fs_info->nodesize, true);
+               ...
+               if (!ret) {
+                       node->bytes_reserved = num_bytes;
+
+But in btrfs_delayed_inode_release_metadata():
+
+       if (qgroup_free)
+               btrfs_qgroup_free_meta_prealloc(node->root,
+                               node->bytes_reserved);
+       else
+               btrfs_qgroup_convert_reserved_meta(node->root,
+                               node->bytes_reserved);
+
+This means, we're always releasing more qgroup metadata rsv than we have
+reserved.
+
+This won't trigger selftest warning, as btrfs qgroup metadata rsv has
+extra protection against cases like quota enabled half-way.
+
+But we still need to fix this problem anyway.
+
+This patch will use the same num_bytes for qgroup metadata rsv so we
+could handle it correctly.
+
+Fixes: f218ea6c4792 ("btrfs: delayed-inode: Remove wrong qgroup meta reservation calls")
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/delayed-inode.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/delayed-inode.c
++++ b/fs/btrfs/delayed-inode.c
+@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_m
+        */
+       if (!src_rsv || (!trans->bytes_reserved &&
+                        src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+-              ret = btrfs_qgroup_reserve_meta_prealloc(root,
+-                              fs_info->nodesize, true);
++              ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+               if (ret < 0)
+                       return ret;
+               ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
diff --git a/queue-5.9/btrfs-reschedule-if-necessary-when-logging-directory-items.patch b/queue-5.9/btrfs-reschedule-if-necessary-when-logging-directory-items.patch
new file mode 100644 (file)
index 0000000..cda4553
--- /dev/null
@@ -0,0 +1,111 @@
+From bb56f02f26fe23798edb1b2175707419b28c752a Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 14 Sep 2020 15:27:50 +0100
+Subject: btrfs: reschedule if necessary when logging directory items
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit bb56f02f26fe23798edb1b2175707419b28c752a upstream.
+
+Logging directories with many entries can take a significant amount of
+time, and in some cases monopolize a cpu/core for a long time if the
+logging task doesn't happen to block often enough.
+
+Johannes and Lu Fengqi reported test case generic/041 triggering a soft
+lockup when the kernel has CONFIG_SOFTLOCKUP_DETECTOR=y. For this test
+case we log an inode with 3002 hard links, and because the test removed
+one hard link before fsyncing the file, the inode logging causes the
+parent directory to be logged as well, which has 6004 directory items to
+log (3002 BTRFS_DIR_ITEM_KEY items plus 3002 BTRFS_DIR_INDEX_KEY items),
+so it can take a significant amount of time and trigger the soft lockup.
+
+So just make tree-log.c:log_dir_items() reschedule when necessary,
+releasing the current search path before doing so and then resume from
+where it was before the reschedule.
+
+The stack trace produced when the soft lockup happens is the following:
+
+[10480.277653] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [xfs_io:28172]
+[10480.279418] Modules linked in: dm_thin_pool dm_persistent_data (...)
+[10480.284915] irq event stamp: 29646366
+[10480.285987] hardirqs last  enabled at (29646365): [<ffffffff85249b66>] __slab_alloc.constprop.0+0x56/0x60
+[10480.288482] hardirqs last disabled at (29646366): [<ffffffff8579b00d>] irqentry_enter+0x1d/0x50
+[10480.290856] softirqs last  enabled at (4612): [<ffffffff85a00323>] __do_softirq+0x323/0x56c
+[10480.293615] softirqs last disabled at (4483): [<ffffffff85800dbf>] asm_call_on_stack+0xf/0x20
+[10480.296428] CPU: 2 PID: 28172 Comm: xfs_io Not tainted 5.9.0-rc4-default+ #1248
+[10480.298948] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+[10480.302455] RIP: 0010:__slab_alloc.constprop.0+0x19/0x60
+[10480.304151] Code: 86 e8 31 75 21 00 66 66 2e 0f 1f 84 00 00 00 (...)
+[10480.309558] RSP: 0018:ffffadbe09397a58 EFLAGS: 00000282
+[10480.311179] RAX: ffff8a495ab92840 RBX: 0000000000000282 RCX: 0000000000000006
+[10480.313242] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff85249b66
+[10480.315260] RBP: ffff8a497d04b740 R08: 0000000000000001 R09: 0000000000000001
+[10480.317229] R10: ffff8a497d044800 R11: ffff8a495ab93c40 R12: 0000000000000000
+[10480.319169] R13: 0000000000000000 R14: 0000000000000c40 R15: ffffffffc01daf70
+[10480.321104] FS:  00007fa1dc5c0e40(0000) GS:ffff8a497da00000(0000) knlGS:0000000000000000
+[10480.323559] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[10480.325235] CR2: 00007fa1dc5befb8 CR3: 0000000004f8a006 CR4: 0000000000170ea0
+[10480.327259] Call Trace:
+[10480.328286]  ? overwrite_item+0x1f0/0x5a0 [btrfs]
+[10480.329784]  __kmalloc+0x831/0xa20
+[10480.331009]  ? btrfs_get_32+0xb0/0x1d0 [btrfs]
+[10480.332464]  overwrite_item+0x1f0/0x5a0 [btrfs]
+[10480.333948]  log_dir_items+0x2ee/0x570 [btrfs]
+[10480.335413]  log_directory_changes+0x82/0xd0 [btrfs]
+[10480.336926]  btrfs_log_inode+0xc9b/0xda0 [btrfs]
+[10480.338374]  ? init_once+0x20/0x20 [btrfs]
+[10480.339711]  btrfs_log_inode_parent+0x8d3/0xd10 [btrfs]
+[10480.341257]  ? dget_parent+0x97/0x2e0
+[10480.342480]  btrfs_log_dentry_safe+0x3a/0x50 [btrfs]
+[10480.343977]  btrfs_sync_file+0x24b/0x5e0 [btrfs]
+[10480.345381]  do_fsync+0x38/0x70
+[10480.346483]  __x64_sys_fsync+0x10/0x20
+[10480.347703]  do_syscall_64+0x2d/0x70
+[10480.348891]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+[10480.350444] RIP: 0033:0x7fa1dc80970b
+[10480.351642] Code: 0f 05 48 3d 00 f0 ff ff 77 45 c3 0f 1f 40 00 48 (...)
+[10480.356952] RSP: 002b:00007fffb3d081d0 EFLAGS: 00000293 ORIG_RAX: 000000000000004a
+[10480.359458] RAX: ffffffffffffffda RBX: 0000562d93d45e40 RCX: 00007fa1dc80970b
+[10480.361426] RDX: 0000562d93d44ab0 RSI: 0000562d93d45e60 RDI: 0000000000000003
+[10480.363367] RBP: 0000000000000001 R08: 0000000000000000 R09: 00007fa1dc7b2a40
+[10480.365317] R10: 0000562d93d0e366 R11: 0000000000000293 R12: 0000000000000001
+[10480.367299] R13: 0000562d93d45290 R14: 0000562d93d45e40 R15: 0000562d93d45e60
+
+Link: https://lore.kernel.org/linux-btrfs/20180713090216.GC575@fnst.localdomain/
+Reported-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+CC: stable@vger.kernel.org # 4.4+
+Tested-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-log.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3615,6 +3615,7 @@ static noinline int log_dir_items(struct
+        * search and this search we'll not find the key again and can just
+        * bail.
+        */
++search:
+       ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
+       if (ret != 0)
+               goto done;
+@@ -3634,6 +3635,13 @@ static noinline int log_dir_items(struct
+                       if (min_key.objectid != ino || min_key.type != key_type)
+                               goto done;
++
++                      if (need_resched()) {
++                              btrfs_release_path(path);
++                              cond_resched();
++                              goto search;
++                      }
++
+                       ret = overwrite_item(trans, log, dst_path, src, i,
+                                            &min_key);
+                       if (ret) {
diff --git a/queue-5.9/btrfs-reschedule-when-cloning-lots-of-extents.patch b/queue-5.9/btrfs-reschedule-when-cloning-lots-of-extents.patch
new file mode 100644 (file)
index 0000000..ab10cd4
--- /dev/null
@@ -0,0 +1,94 @@
+From 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Tue, 22 Sep 2020 17:27:29 +0900
+Subject: btrfs: reschedule when cloning lots of extents
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 upstream.
+
+We have several occurrences of a soft lockup from fstest's generic/175
+testcase, which look more or less like this one:
+
+  watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [xfs_io:10030]
+  Kernel panic - not syncing: softlockup: hung tasks
+  CPU: 0 PID: 10030 Comm: xfs_io Tainted: G             L    5.9.0-rc5+ #768
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4-rebuilt.opensuse.org 04/01/2014
+  Call Trace:
+   <IRQ>
+   dump_stack+0x77/0xa0
+   panic+0xfa/0x2cb
+   watchdog_timer_fn.cold+0x85/0xa5
+   ? lockup_detector_update_enable+0x50/0x50
+   __hrtimer_run_queues+0x99/0x4c0
+   ? recalibrate_cpu_khz+0x10/0x10
+   hrtimer_run_queues+0x9f/0xb0
+   update_process_times+0x28/0x80
+   tick_handle_periodic+0x1b/0x60
+   __sysvec_apic_timer_interrupt+0x76/0x210
+   asm_call_on_stack+0x12/0x20
+   </IRQ>
+   sysvec_apic_timer_interrupt+0x7f/0x90
+   asm_sysvec_apic_timer_interrupt+0x12/0x20
+  RIP: 0010:btrfs_tree_unlock+0x91/0x1a0 [btrfs]
+  RSP: 0018:ffffc90007123a58 EFLAGS: 00000282
+  RAX: ffff8881cea2fbe0 RBX: ffff8881cea2fbe0 RCX: 0000000000000000
+  RDX: ffff8881d23fd200 RSI: ffffffff82045220 RDI: ffff8881cea2fba0
+  RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000032
+  R10: 0000160000000000 R11: 0000000000001000 R12: 0000000000001000
+  R13: ffff8882357fd5b0 R14: ffff88816fa76e70 R15: ffff8881cea2fad0
+   ? btrfs_tree_unlock+0x15b/0x1a0 [btrfs]
+   btrfs_release_path+0x67/0x80 [btrfs]
+   btrfs_insert_replace_extent+0x177/0x2c0 [btrfs]
+   btrfs_replace_file_extents+0x472/0x7c0 [btrfs]
+   btrfs_clone+0x9ba/0xbd0 [btrfs]
+   btrfs_clone_files.isra.0+0xeb/0x140 [btrfs]
+   ? file_update_time+0xcd/0x120
+   btrfs_remap_file_range+0x322/0x3b0 [btrfs]
+   do_clone_file_range+0xb7/0x1e0
+   vfs_clone_file_range+0x30/0xa0
+   ioctl_file_clone+0x8a/0xc0
+   do_vfs_ioctl+0x5b2/0x6f0
+   __x64_sys_ioctl+0x37/0xa0
+   do_syscall_64+0x33/0x40
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  RIP: 0033:0x7f87977fc247
+  RSP: 002b:00007ffd51a2f6d8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
+  RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f87977fc247
+  RDX: 00007ffd51a2f710 RSI: 000000004020940d RDI: 0000000000000003
+  RBP: 0000000000000004 R08: 00007ffd51a79080 R09: 0000000000000000
+  R10: 00005621f11352f2 R11: 0000000000000206 R12: 0000000000000000
+  R13: 0000000000000000 R14: 00005621f128b958 R15: 0000000080000000
+  Kernel Offset: disabled
+  ---[ end Kernel panic - not syncing: softlockup: hung tasks ]---
+
+All of these lockup reports have the call chain btrfs_clone_files() ->
+btrfs_clone() in common. btrfs_clone_files() calls btrfs_clone() with
+both source and destination extents locked and loops over the source
+extent to create the clones.
+
+Conditionally reschedule in the btrfs_clone() loop, to give some time back
+to other processes.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/reflink.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/reflink.c
++++ b/fs/btrfs/reflink.c
+@@ -520,6 +520,8 @@ process_slot:
+                       ret = -EINTR;
+                       goto out;
+               }
++
++              cond_resched();
+       }
+       ret = 0;
diff --git a/queue-5.9/btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch b/queue-5.9/btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch
new file mode 100644 (file)
index 0000000..af4cb8c
--- /dev/null
@@ -0,0 +1,273 @@
+From 98272bb77bf4cc20ed1ffca89832d713e70ebf09 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 21 Sep 2020 14:13:29 +0100
+Subject: btrfs: send, orphanize first all conflicting inodes when processing references
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 98272bb77bf4cc20ed1ffca89832d713e70ebf09 upstream.
+
+When doing an incremental send it is possible that when processing the new
+references for an inode we end up issuing rename or link operations that
+have an invalid path, which contains the orphanized name of a directory
+before we actually orphanized it, causing the receiver to fail.
+
+The following reproducer triggers such scenario:
+
+  $ cat reproducer.sh
+  #!/bin/bash
+
+  mkfs.btrfs -f /dev/sdi >/dev/null
+  mount /dev/sdi /mnt/sdi
+
+  touch /mnt/sdi/a
+  touch /mnt/sdi/b
+  mkdir /mnt/sdi/testdir
+  # We want "a" to have a lower inode number than "testdir" (257 vs 259).
+  mv /mnt/sdi/a /mnt/sdi/testdir/a
+
+  # Filesystem looks like:
+  #
+  # .                           (ino 256)
+  # |----- testdir/             (ino 259)
+  # |          |----- a         (ino 257)
+  # |
+  # |----- b                    (ino 258)
+
+  btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1
+  btrfs send -f /tmp/snap1.send /mnt/sdi/snap1
+
+  # Now rename 259 to "testdir_2", then change the name of 257 to
+  # "testdir" and make it a direct descendant of the root inode (256).
+  # Also create a new link for inode 257 with the old name of inode 258.
+  # This swaps the names and locations of several inodes and creates a
+  # nasty dependency chain of rename and link operations.
+  mv /mnt/sdi/testdir/a /mnt/sdi/a2
+  touch /mnt/sdi/testdir/a
+  mv /mnt/sdi/b /mnt/sdi/b2
+  ln /mnt/sdi/a2 /mnt/sdi/b
+  mv /mnt/sdi/testdir /mnt/sdi/testdir_2
+  mv /mnt/sdi/a2 /mnt/sdi/testdir
+
+  # Filesystem now looks like:
+  #
+  # .                            (ino 256)
+  # |----- testdir_2/            (ino 259)
+  # |          |----- a          (ino 260)
+  # |
+  # |----- testdir               (ino 257)
+  # |----- b                     (ino 257)
+  # |----- b2                    (ino 258)
+
+  btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2
+  btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2
+
+  mkfs.btrfs -f /dev/sdj >/dev/null
+  mount /dev/sdj /mnt/sdj
+
+  btrfs receive -f /tmp/snap1.send /mnt/sdj
+  btrfs receive -f /tmp/snap2.send /mnt/sdj
+
+  umount /mnt/sdi
+  umount /mnt/sdj
+
+When running the reproducer, the receive of the incremental send stream
+fails:
+
+  $ ./reproducer.sh
+  Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1'
+  At subvol /mnt/sdi/snap1
+  Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2'
+  At subvol /mnt/sdi/snap2
+  At subvol snap1
+  At snapshot snap2
+  ERROR: link b -> o259-6-0/a failed: No such file or directory
+
+The problem happens because of the following:
+
+1) Before we start iterating the list of new references for inode 257,
+   we generate its current path and store it at @valid_path, done at
+   the very beginning of process_recorded_refs(). The generated path
+   is "o259-6-0/a", containing the orphanized name for inode 259;
+
+2) Then we iterate over the list of new references, which has the
+   references "b" and "testdir" in that specific order;
+
+3) We process reference "b" first, because it is in the list before
+   reference "testdir". We then issue a link operation to create
+   the new reference "b" using a target path corresponding to the
+   content at @valid_path, which corresponds to "o259-6-0/a".
+   However we haven't yet orphanized inode 259, its name is still
+   "testdir", and not "o259-6-0". The orphanization of 259 did not
+   happen yet because we will process the reference named "testdir"
+   for inode 257 only in the next iteration of the loop that goes
+   over the list of new references.
+
+Fix the issue by having a preliminary iteration over all the new references
+at process_recorded_refs(). This iteration is responsible only for doing
+the orphanization of other inodes that have an old reference that
+conflicts with one of the new references of the inode we are currently
+processing. The emission of rename and link operations happens now in the
+next iteration of the new references.
+
+A test case for fstests will follow soon.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c |  127 ++++++++++++++++++++++++++++++++++++++------------------
+ 1 file changed, 87 insertions(+), 40 deletions(-)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -3880,52 +3880,56 @@ static int process_recorded_refs(struct
+                       goto out;
+       }
++      /*
++       * Before doing any rename and link operations, do a first pass on the
++       * new references to orphanize any unprocessed inodes that may have a
++       * reference that conflicts with one of the new references of the current
++       * inode. This needs to happen first because a new reference may conflict
++       * with the old reference of a parent directory, so we must make sure
++       * that the path used for link and rename commands don't use an
++       * orphanized name when an ancestor was not yet orphanized.
++       *
++       * Example:
++       *
++       * Parent snapshot:
++       *
++       * .                                                      (ino 256)
++       * |----- testdir/                                        (ino 259)
++       * |          |----- a                                    (ino 257)
++       * |
++       * |----- b                                               (ino 258)
++       *
++       * Send snapshot:
++       *
++       * .                                                      (ino 256)
++       * |----- testdir_2/                                      (ino 259)
++       * |          |----- a                                    (ino 260)
++       * |
++       * |----- testdir                                         (ino 257)
++       * |----- b                                               (ino 257)
++       * |----- b2                                              (ino 258)
++       *
++       * Processing the new reference for inode 257 with name "b" may happen
++       * before processing the new reference with name "testdir". If so, we
++       * must make sure that by the time we send a link command to create the
++       * hard link "b", inode 259 was already orphanized, since the generated
++       * path in "valid_path" already contains the orphanized name for 259.
++       * We are processing inode 257, so only later when processing 259 we do
++       * the rename operation to change its temporary (orphanized) name to
++       * "testdir_2".
++       */
+       list_for_each_entry(cur, &sctx->new_refs, list) {
+-              /*
+-               * We may have refs where the parent directory does not exist
+-               * yet. This happens if the parent directories inum is higher
+-               * than the current inum. To handle this case, we create the
+-               * parent directory out of order. But we need to check if this
+-               * did already happen before due to other refs in the same dir.
+-               */
+               ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
+               if (ret < 0)
+                       goto out;
+-              if (ret == inode_state_will_create) {
+-                      ret = 0;
+-                      /*
+-                       * First check if any of the current inodes refs did
+-                       * already create the dir.
+-                       */
+-                      list_for_each_entry(cur2, &sctx->new_refs, list) {
+-                              if (cur == cur2)
+-                                      break;
+-                              if (cur2->dir == cur->dir) {
+-                                      ret = 1;
+-                                      break;
+-                              }
+-                      }
+-
+-                      /*
+-                       * If that did not happen, check if a previous inode
+-                       * did already create the dir.
+-                       */
+-                      if (!ret)
+-                              ret = did_create_dir(sctx, cur->dir);
+-                      if (ret < 0)
+-                              goto out;
+-                      if (!ret) {
+-                              ret = send_create_inode(sctx, cur->dir);
+-                              if (ret < 0)
+-                                      goto out;
+-                      }
+-              }
++              if (ret == inode_state_will_create)
++                      continue;
+               /*
+-               * Check if this new ref would overwrite the first ref of
+-               * another unprocessed inode. If yes, orphanize the
+-               * overwritten inode. If we find an overwritten ref that is
+-               * not the first ref, simply unlink it.
++               * Check if this new ref would overwrite the first ref of another
++               * unprocessed inode. If yes, orphanize the overwritten inode.
++               * If we find an overwritten ref that is not the first ref,
++               * simply unlink it.
+                */
+               ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
+                               cur->name, cur->name_len,
+@@ -4002,6 +4006,49 @@ static int process_recorded_refs(struct
+                               if (ret < 0)
+                                       goto out;
+                       }
++              }
++
++      }
++
++      list_for_each_entry(cur, &sctx->new_refs, list) {
++              /*
++               * We may have refs where the parent directory does not exist
++               * yet. This happens if the parent directories inum is higher
++               * than the current inum. To handle this case, we create the
++               * parent directory out of order. But we need to check if this
++               * did already happen before due to other refs in the same dir.
++               */
++              ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
++              if (ret < 0)
++                      goto out;
++              if (ret == inode_state_will_create) {
++                      ret = 0;
++                      /*
++                       * First check if any of the current inodes refs did
++                       * already create the dir.
++                       */
++                      list_for_each_entry(cur2, &sctx->new_refs, list) {
++                              if (cur == cur2)
++                                      break;
++                              if (cur2->dir == cur->dir) {
++                                      ret = 1;
++                                      break;
++                              }
++                      }
++
++                      /*
++                       * If that did not happen, check if a previous inode
++                       * did already create the dir.
++                       */
++                      if (!ret)
++                              ret = did_create_dir(sctx, cur->dir);
++                      if (ret < 0)
++                              goto out;
++                      if (!ret) {
++                              ret = send_create_inode(sctx, cur->dir);
++                              if (ret < 0)
++                                      goto out;
++                      }
+               }
+               if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
diff --git a/queue-5.9/btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch b/queue-5.9/btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch
new file mode 100644 (file)
index 0000000..40bb114
--- /dev/null
@@ -0,0 +1,244 @@
+From 9c2b4e0347067396ceb3ae929d6888c81d610259 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 21 Sep 2020 14:13:30 +0100
+Subject: btrfs: send, recompute reference path after orphanization of a directory
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 9c2b4e0347067396ceb3ae929d6888c81d610259 upstream.
+
+During an incremental send, when an inode has multiple new references we
+might end up emitting rename operations for orphanizations that have a
+source path that is no longer valid due to a previous orphanization of
+some directory inode. This causes the receiver to fail since it tries
+to rename a path that does not exist.
+
+Example reproducer:
+
+  $ cat reproducer.sh
+  #!/bin/bash
+
+  mkfs.btrfs -f /dev/sdi >/dev/null
+  mount /dev/sdi /mnt/sdi
+
+  touch /mnt/sdi/f1
+  touch /mnt/sdi/f2
+  mkdir /mnt/sdi/d1
+  mkdir /mnt/sdi/d1/d2
+
+  # Filesystem looks like:
+  #
+  # .                           (ino 256)
+  # |----- f1                   (ino 257)
+  # |----- f2                   (ino 258)
+  # |----- d1/                  (ino 259)
+  #        |----- d2/           (ino 260)
+
+  btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1
+  btrfs send -f /tmp/snap1.send /mnt/sdi/snap1
+
+  # Now do a series of changes such that:
+  #
+  # *) inode 258 has one new hardlink and the previous name changed
+  #
+  # *) both names conflict with the old names of two other inodes:
+  #
+  #    1) the new name "d1" conflicts with the old name of inode 259,
+  #       under directory inode 256 (root)
+  #
+  #    2) the new name "d2" conflicts with the old name of inode 260
+  #       under directory inode 259
+  #
+  # *) inodes 259 and 260 now have the old names of inode 258
+  #
+  # *) inode 257 is now located under inode 260 - an inode with a number
+  #    smaller than the inode (258) for which we created a second hard
+  #    link and swapped its names with inodes 259 and 260
+  #
+  ln /mnt/sdi/f2 /mnt/sdi/d1/f2_link
+  mv /mnt/sdi/f1 /mnt/sdi/d1/d2/f1
+
+  # Swap d1 and f2.
+  mv /mnt/sdi/d1 /mnt/sdi/tmp
+  mv /mnt/sdi/f2 /mnt/sdi/d1
+  mv /mnt/sdi/tmp /mnt/sdi/f2
+
+  # Swap d2 and f2_link
+  mv /mnt/sdi/f2/d2 /mnt/sdi/tmp
+  mv /mnt/sdi/f2/f2_link /mnt/sdi/f2/d2
+  mv /mnt/sdi/tmp /mnt/sdi/f2/f2_link
+
+  # Filesystem now looks like:
+  #
+  # .                                (ino 256)
+  # |----- d1                        (ino 258)
+  # |----- f2/                       (ino 259)
+  #        |----- f2_link/           (ino 260)
+  #        |       |----- f1         (ino 257)
+  #        |
+  #        |----- d2                 (ino 258)
+
+  btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2
+  btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2
+
+  mkfs.btrfs -f /dev/sdj >/dev/null
+  mount /dev/sdj /mnt/sdj
+
+  btrfs receive -f /tmp/snap1.send /mnt/sdj
+  btrfs receive -f /tmp/snap2.send /mnt/sdj
+
+  umount /mnt/sdi
+  umount /mnt/sdj
+
+When executed the receive of the incremental stream fails:
+
+  $ ./reproducer.sh
+  Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1'
+  At subvol /mnt/sdi/snap1
+  Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2'
+  At subvol /mnt/sdi/snap2
+  At subvol snap1
+  At snapshot snap2
+  ERROR: rename d1/d2 -> o260-6-0 failed: No such file or directory
+
+This happens because:
+
+1) When processing inode 257 we end up computing the name for inode 259
+   because it is an ancestor in the send snapshot, and at that point it
+   still has its old name, "d1", from the parent snapshot because inode
+   259 was not yet processed. We then cache that name, which is valid
+   until we start processing inode 259 (or set the progress to 260 after
+   processing its references);
+
+2) Later we start processing inode 258 and collecting all its new
+   references into the list sctx->new_refs. The first reference in the
+   list happens to be the reference for name "d1" while the reference for
+   name "d2" is next (the last element of the list).
+   We compute the full path "d1/d2" for this second reference and store
+   it in the reference (its ->full_path member). The path used for the
+   new parent directory was "d1" and not "f2" because inode 259, the
+   new parent, was not yet processed;
+
+3) When we start processing the new references at process_recorded_refs()
+   we start with the first reference in the list, for the new name "d1".
+   Because there is a conflicting inode that was not yet processed, which
+   is directory inode 259, we orphanize it, renaming it from "d1" to
+   "o259-6-0";
+
+4) Then we start processing the new reference for name "d2", and we
+   realize it conflicts with the reference of inode 260 in the parent
+   snapshot. So we issue an orphanization operation for inode 260 by
+   emitting a rename operation with a destination path of "o260-6-0"
+   and a source path of "d1/d2" - this source path is the value we
+   stored in the reference earlier at step 2), corresponding to the
+   ->full_path member of the reference, however that path is no longer
+   valid due to the orphanization of the directory inode 259 in step 3).
+   This makes the receiver fail since the path does not exist; it should
+   have been "o259-6-0/d2".
+
+Fix this by recomputing the full path of a reference before emitting an
+orphanization if we previously orphanized any directory, since that
+directory could be a parent in the new path. This is a rare scenario so
+keeping it simple and not checking if that previously orphanized directory
+is in fact an ancestor of the inode we are trying to orphanize.
+
+A test case for fstests follows soon.
+
+CC: stable@vger.kernel.org # 4.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 72 insertions(+)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -3813,6 +3813,72 @@ static int update_ref_path(struct send_c
+ }
+ /*
++ * When processing the new references for an inode we may orphanize an existing
++ * directory inode because its old name conflicts with one of the new references
++ * of the current inode. Later, when processing another new reference of our
++ * inode, we might need to orphanize another inode, but the path we have in the
++ * reference reflects the pre-orphanization name of the directory we previously
++ * orphanized. For example:
++ *
++ * parent snapshot looks like:
++ *
++ * .                                     (ino 256)
++ * |----- f1                             (ino 257)
++ * |----- f2                             (ino 258)
++ * |----- d1/                            (ino 259)
++ *        |----- d2/                     (ino 260)
++ *
++ * send snapshot looks like:
++ *
++ * .                                     (ino 256)
++ * |----- d1                             (ino 258)
++ * |----- f2/                            (ino 259)
++ *        |----- f2_link/                (ino 260)
++ *        |       |----- f1              (ino 257)
++ *        |
++ *        |----- d2                      (ino 258)
++ *
++ * When processing inode 257 we compute the name for inode 259 as "d1", and we
++ * cache it in the name cache. Later when we start processing inode 258, when
++ * collecting all its new references we set a full path of "d1/d2" for its new
++ * reference with name "d2". When we start processing the new references we
++ * start by processing the new reference with name "d1", and this results in
++ * orphanizing inode 259, since its old reference causes a conflict. Then we
++ * move on to the next new reference, with name "d2", and we find out we must
++ * orphanize inode 260, as its old reference conflicts with ours - but for the
++ * orphanization we use a source path corresponding to the path we stored in the
++ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
++ * receiver fail since the path component "d1/" no longer exists, it was renamed
++ * to "o259-6-0/" when processing the previous new reference. So in this case we
++ * must recompute the path in the new reference and use it for the new
++ * orphanization operation.
++ */
++static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
++{
++      char *name;
++      int ret;
++
++      name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
++      if (!name)
++              return -ENOMEM;
++
++      fs_path_reset(ref->full_path);
++      ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
++      if (ret < 0)
++              goto out;
++
++      ret = fs_path_add(ref->full_path, name, ref->name_len);
++      if (ret < 0)
++              goto out;
++
++      /* Update the reference's base name pointer. */
++      set_ref_path(ref, ref->full_path);
++out:
++      kfree(name);
++      return ret;
++}
++
++/*
+  * This does all the move/link/unlink/rmdir magic.
+  */
+ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
+@@ -3946,6 +4012,12 @@ static int process_recorded_refs(struct
+                               struct name_cache_entry *nce;
+                               struct waiting_dir_move *wdm;
++                              if (orphanized_dir) {
++                                      ret = refresh_ref_path(sctx, cur);
++                                      if (ret < 0)
++                                              goto out;
++                              }
++
+                               ret = orphanize_inode(sctx, ow_inode, ow_gen,
+                                               cur->full_path);
+                               if (ret < 0)
diff --git a/queue-5.9/btrfs-skip-devices-without-magic-signature-when-mounting.patch b/queue-5.9/btrfs-skip-devices-without-magic-signature-when-mounting.patch
new file mode 100644 (file)
index 0000000..36b9301
--- /dev/null
@@ -0,0 +1,93 @@
+From 96c2e067ed3e3e004580a643c76f58729206b829 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Wed, 30 Sep 2020 21:09:52 +0800
+Subject: btrfs: skip devices without magic signature when mounting
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 96c2e067ed3e3e004580a643c76f58729206b829 upstream.
+
+Many things can happen after the device is scanned and before the device
+is mounted.  One such thing is losing the BTRFS_MAGIC on the device.
+If that happens we still don't free that device from memory, which causes
+confusion in userland.
+
+For example: As the BTRFS_IOC_DEV_INFO still carries the device path
+which does not have the BTRFS_MAGIC, 'btrfs fi show' still lists
+device which does not belong to the filesystem anymore:
+
+  $ mkfs.btrfs -fq -draid1 -mraid1 /dev/sda /dev/sdb
+  $ wipefs -a /dev/sdb
+  # /dev/sdb does not contain magic signature
+  $ mount -o degraded /dev/sda /btrfs
+  $ btrfs fi show -m
+  Label: none  uuid: 470ec6fb-646b-4464-b3cb-df1b26c527bd
+         Total devices 2 FS bytes used 128.00KiB
+         devid    1 size 3.00GiB used 571.19MiB path /dev/sda
+         devid    2 size 3.00GiB used 571.19MiB path /dev/sdb
+
+We need to distinguish the missing signature and invalid superblock, so
+add a specific error code ENODATA for that. This also fixes failure of
+fstest btrfs/198.
+
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/disk-io.c |    8 ++++++--
+ fs/btrfs/volumes.c |   18 ++++++++++++------
+ 2 files changed, 18 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3482,8 +3482,12 @@ struct btrfs_super_block *btrfs_read_dev
+               return ERR_CAST(page);
+       super = page_address(page);
+-      if (btrfs_super_bytenr(super) != bytenr ||
+-                  btrfs_super_magic(super) != BTRFS_MAGIC) {
++      if (btrfs_super_magic(super) != BTRFS_MAGIC) {
++              btrfs_release_disk_super(super);
++              return ERR_PTR(-ENODATA);
++      }
++
++      if (btrfs_super_bytenr(super) != bytenr) {
+               btrfs_release_disk_super(super);
+               return ERR_PTR(-EINVAL);
+       }
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -1200,17 +1200,23 @@ static int open_fs_devices(struct btrfs_
+ {
+       struct btrfs_device *device;
+       struct btrfs_device *latest_dev = NULL;
++      struct btrfs_device *tmp_device;
+       flags |= FMODE_EXCL;
+-      list_for_each_entry(device, &fs_devices->devices, dev_list) {
+-              /* Just open everything we can; ignore failures here */
+-              if (btrfs_open_one_device(fs_devices, device, flags, holder))
+-                      continue;
++      list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
++                               dev_list) {
++              int ret;
+-              if (!latest_dev ||
+-                  device->generation > latest_dev->generation)
++              ret = btrfs_open_one_device(fs_devices, device, flags, holder);
++              if (ret == 0 &&
++                  (!latest_dev || device->generation > latest_dev->generation)) {
+                       latest_dev = device;
++              } else if (ret == -ENODATA) {
++                      fs_devices->num_devices--;
++                      list_del(&device->dev_list);
++                      btrfs_free_device(device);
++              }
+       }
+       if (fs_devices->open_devices == 0)
+               return -EINVAL;
diff --git a/queue-5.9/btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch b/queue-5.9/btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch
new file mode 100644 (file)
index 0000000..7f47147
--- /dev/null
@@ -0,0 +1,188 @@
+From ca10845a56856fff4de3804c85e6424d0f6d0cde Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Tue, 1 Sep 2020 08:09:01 -0400
+Subject: btrfs: sysfs: init devices outside of the chunk_mutex
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit ca10845a56856fff4de3804c85e6424d0f6d0cde upstream.
+
+While running btrfs/061, btrfs/073, btrfs/078, or btrfs/178 we hit the
+following lockdep splat:
+
+  ======================================================
+  WARNING: possible circular locking dependency detected
+  5.9.0-rc3+ #4 Not tainted
+  ------------------------------------------------------
+  kswapd0/100 is trying to acquire lock:
+  ffff96ecc22ef4a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330
+
+  but task is already holding lock:
+  ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+
+  which lock already depends on the new lock.
+
+  the existing dependency chain (in reverse order) is:
+
+  -> #3 (fs_reclaim){+.+.}-{0:0}:
+        fs_reclaim_acquire+0x65/0x80
+        slab_pre_alloc_hook.constprop.0+0x20/0x200
+        kmem_cache_alloc+0x37/0x270
+        alloc_inode+0x82/0xb0
+        iget_locked+0x10d/0x2c0
+        kernfs_get_inode+0x1b/0x130
+        kernfs_get_tree+0x136/0x240
+        sysfs_get_tree+0x16/0x40
+        vfs_get_tree+0x28/0xc0
+        path_mount+0x434/0xc00
+        __x64_sys_mount+0xe3/0x120
+        do_syscall_64+0x33/0x40
+        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  -> #2 (kernfs_mutex){+.+.}-{3:3}:
+        __mutex_lock+0x7e/0x7e0
+        kernfs_add_one+0x23/0x150
+        kernfs_create_link+0x63/0xa0
+        sysfs_do_create_link_sd+0x5e/0xd0
+        btrfs_sysfs_add_devices_dir+0x81/0x130
+        btrfs_init_new_device+0x67f/0x1250
+        btrfs_ioctl+0x1ef/0x2e20
+        __x64_sys_ioctl+0x83/0xb0
+        do_syscall_64+0x33/0x40
+        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}:
+        __mutex_lock+0x7e/0x7e0
+        btrfs_chunk_alloc+0x125/0x3a0
+        find_free_extent+0xdf6/0x1210
+        btrfs_reserve_extent+0xb3/0x1b0
+        btrfs_alloc_tree_block+0xb0/0x310
+        alloc_tree_block_no_bg_flush+0x4a/0x60
+        __btrfs_cow_block+0x11a/0x530
+        btrfs_cow_block+0x104/0x220
+        btrfs_search_slot+0x52e/0x9d0
+        btrfs_insert_empty_items+0x64/0xb0
+        btrfs_insert_delayed_items+0x90/0x4f0
+        btrfs_commit_inode_delayed_items+0x93/0x140
+        btrfs_log_inode+0x5de/0x2020
+        btrfs_log_inode_parent+0x429/0xc90
+        btrfs_log_new_name+0x95/0x9b
+        btrfs_rename2+0xbb9/0x1800
+        vfs_rename+0x64f/0x9f0
+        do_renameat2+0x320/0x4e0
+        __x64_sys_rename+0x1f/0x30
+        do_syscall_64+0x33/0x40
+        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
+        __lock_acquire+0x119c/0x1fc0
+        lock_acquire+0xa7/0x3d0
+        __mutex_lock+0x7e/0x7e0
+        __btrfs_release_delayed_node.part.0+0x3f/0x330
+        btrfs_evict_inode+0x24c/0x500
+        evict+0xcf/0x1f0
+        dispose_list+0x48/0x70
+        prune_icache_sb+0x44/0x50
+        super_cache_scan+0x161/0x1e0
+        do_shrink_slab+0x178/0x3c0
+        shrink_slab+0x17c/0x290
+        shrink_node+0x2b2/0x6d0
+        balance_pgdat+0x30a/0x670
+        kswapd+0x213/0x4c0
+        kthread+0x138/0x160
+        ret_from_fork+0x1f/0x30
+
+  other info that might help us debug this:
+
+  Chain exists of:
+    &delayed_node->mutex --> kernfs_mutex --> fs_reclaim
+
+   Possible unsafe locking scenario:
+
+        CPU0                    CPU1
+        ----                    ----
+    lock(fs_reclaim);
+                                lock(kernfs_mutex);
+                                lock(fs_reclaim);
+    lock(&delayed_node->mutex);
+
+   *** DEADLOCK ***
+
+  3 locks held by kswapd0/100:
+   #0: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30
+   #1: ffffffff8dd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290
+   #2: ffff96ed2ade30e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0
+
+  stack backtrace:
+  CPU: 0 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #4
+  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014
+  Call Trace:
+   dump_stack+0x8b/0xb8
+   check_noncircular+0x12d/0x150
+   __lock_acquire+0x119c/0x1fc0
+   lock_acquire+0xa7/0x3d0
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x330
+   __mutex_lock+0x7e/0x7e0
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x330
+   ? __btrfs_release_delayed_node.part.0+0x3f/0x330
+   ? lock_acquire+0xa7/0x3d0
+   ? find_held_lock+0x2b/0x80
+   __btrfs_release_delayed_node.part.0+0x3f/0x330
+   btrfs_evict_inode+0x24c/0x500
+   evict+0xcf/0x1f0
+   dispose_list+0x48/0x70
+   prune_icache_sb+0x44/0x50
+   super_cache_scan+0x161/0x1e0
+   do_shrink_slab+0x178/0x3c0
+   shrink_slab+0x17c/0x290
+   shrink_node+0x2b2/0x6d0
+   balance_pgdat+0x30a/0x670
+   kswapd+0x213/0x4c0
+   ? _raw_spin_unlock_irqrestore+0x41/0x50
+   ? add_wait_queue_exclusive+0x70/0x70
+   ? balance_pgdat+0x670/0x670
+   kthread+0x138/0x160
+   ? kthread_create_worker_on_cpu+0x40/0x40
+   ret_from_fork+0x1f/0x30
+
+This happens because we are holding the chunk_mutex at the time of
+adding in a new device.  However we only need to hold the
+device_list_mutex, as we're going to iterate over the fs_devices
+devices.  Move the sysfs init stuff outside of the chunk_mutex to get
+rid of this lockdep splat.
+
+CC: stable@vger.kernel.org # 4.4.x: f3cd2c58110dad14e: btrfs: sysfs, rename device_link add/remove functions
+CC: stable@vger.kernel.org # 4.4.x
+Reported-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -2613,9 +2613,6 @@ int btrfs_init_new_device(struct btrfs_f
+       btrfs_set_super_num_devices(fs_info->super_copy,
+                                   orig_super_num_devices + 1);
+-      /* add sysfs device entry */
+-      btrfs_sysfs_add_devices_dir(fs_devices, device);
+-
+       /*
+        * we've got more storage, clear any full flags on the space
+        * infos
+@@ -2623,6 +2620,10 @@ int btrfs_init_new_device(struct btrfs_f
+       btrfs_clear_space_info_full(fs_info);
+       mutex_unlock(&fs_info->chunk_mutex);
++
++      /* Add sysfs device entry */
++      btrfs_sysfs_add_devices_dir(fs_devices, device);
++
+       mutex_unlock(&fs_devices->device_list_mutex);
+       if (seeding_dev) {
diff --git a/queue-5.9/btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch b/queue-5.9/btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch
new file mode 100644 (file)
index 0000000..e8423f8
--- /dev/null
@@ -0,0 +1,120 @@
+From 437490fed3b0c9ae21af8f70e0f338d34560842b Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 28 Jul 2020 09:42:49 +0800
+Subject: btrfs: tracepoints: output proper root owner for trace_find_free_extent()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 437490fed3b0c9ae21af8f70e0f338d34560842b upstream.
+
+The current trace event always outputs results like this:
+
+ find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA)
+ find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA)
+ find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA)
+ find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA)
+
+It's saying we're allocating a data extent for the EXTENT tree, which is not
+even possible.
+
+It's because we always use EXTENT tree as the owner for
+trace_find_free_extent() without using the @root from
+btrfs_reserve_extent().
+
+This patch will change the parameter to use proper @root for
+trace_find_free_extent():
+
+Now it looks much better:
+
+ find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=4096 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA)
+ find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=7(CSUM_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+ find_free_extent: root=1(ROOT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP)
+
+Reported-by: Hans van Kranenburg <hans@knorrie.org>
+CC: stable@vger.kernel.org # 5.4+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/extent-tree.c       |    7 ++++---
+ include/trace/events/btrfs.h |   10 ++++++----
+ 2 files changed, 10 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3918,11 +3918,12 @@ static int prepare_allocation(struct btr
+  * |- Push harder to find free extents
+  *    |- If not found, re-iterate all block groups
+  */
+-static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
++static noinline int find_free_extent(struct btrfs_root *root,
+                               u64 ram_bytes, u64 num_bytes, u64 empty_size,
+                               u64 hint_byte_orig, struct btrfs_key *ins,
+                               u64 flags, int delalloc)
+ {
++      struct btrfs_fs_info *fs_info = root->fs_info;
+       int ret = 0;
+       int cache_block_group_error = 0;
+       struct btrfs_block_group *block_group = NULL;
+@@ -3954,7 +3955,7 @@ static noinline int find_free_extent(str
+       ins->objectid = 0;
+       ins->offset = 0;
+-      trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
++      trace_find_free_extent(root, num_bytes, empty_size, flags);
+       space_info = btrfs_find_space_info(fs_info, flags);
+       if (!space_info) {
+@@ -4203,7 +4204,7 @@ int btrfs_reserve_extent(struct btrfs_ro
+       flags = get_alloc_profile_by_root(root, is_data);
+ again:
+       WARN_ON(num_bytes < fs_info->sectorsize);
+-      ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
++      ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
+                              hint_byte, ins, flags, delalloc);
+       if (!ret && !is_data) {
+               btrfs_dec_block_group_reservations(fs_info, ins->objectid);
+--- a/include/trace/events/btrfs.h
++++ b/include/trace/events/btrfs.h
+@@ -1176,25 +1176,27 @@ DEFINE_EVENT(btrfs__reserved_extent,  bt
+ TRACE_EVENT(find_free_extent,
+-      TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes,
++      TP_PROTO(const struct btrfs_root *root, u64 num_bytes,
+                u64 empty_size, u64 data),
+-      TP_ARGS(fs_info, num_bytes, empty_size, data),
++      TP_ARGS(root, num_bytes, empty_size, data),
+       TP_STRUCT__entry_btrfs(
++              __field(        u64,    root_objectid           )
+               __field(        u64,    num_bytes               )
+               __field(        u64,    empty_size              )
+               __field(        u64,    data                    )
+       ),
+-      TP_fast_assign_btrfs(fs_info,
++      TP_fast_assign_btrfs(root->fs_info,
++              __entry->root_objectid  = root->root_key.objectid;
+               __entry->num_bytes      = num_bytes;
+               __entry->empty_size     = empty_size;
+               __entry->data           = data;
+       ),
+       TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)",
+-                show_root_type(BTRFS_EXTENT_TREE_OBJECTID),
++                show_root_type(__entry->root_objectid),
+                 __entry->num_bytes, __entry->empty_size, __entry->data,
+                 __print_flags((unsigned long)__entry->data, "|",
+                                BTRFS_GROUP_FLAGS))
diff --git a/queue-5.9/btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch b/queue-5.9/btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch
new file mode 100644 (file)
index 0000000..cead2cb
--- /dev/null
@@ -0,0 +1,102 @@
+From 1465af12e254a68706e110846f59cf0f09683184 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 22 Sep 2020 10:37:01 +0800
+Subject: btrfs: tree-checker: fix false alert caused by legacy btrfs root item
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 1465af12e254a68706e110846f59cf0f09683184 upstream.
+
+Commit 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check")
+introduced btrfs root item size check, however btrfs root item has two
+versions, the legacy one which just ends before generation_v2 member, is
+smaller than current btrfs root item size.
+
+This caused btrfs kernel to reject valid but old tree root leaves.
+
+Fix this problem by also allowing legacy root item, since kernel can
+already handle them pretty well and upgrade to newer root item format
+when needed.
+
+Reported-by: Martin Steigerwald <martin@lichtvoll.de>
+Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check")
+CC: stable@vger.kernel.org # 5.4+
+Tested-By: Martin Steigerwald <martin@lichtvoll.de>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-checker.c         |   17 ++++++++++++-----
+ include/uapi/linux/btrfs_tree.h |   14 ++++++++++++++
+ 2 files changed, 26 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent
+                          int slot)
+ {
+       struct btrfs_fs_info *fs_info = leaf->fs_info;
+-      struct btrfs_root_item ri;
++      struct btrfs_root_item ri = { 0 };
+       const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY |
+                                    BTRFS_ROOT_SUBVOL_DEAD;
+       int ret;
+@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent
+       if (ret < 0)
+               return ret;
+-      if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) {
++      if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) &&
++          btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) {
+               generic_err(leaf, slot,
+-                          "invalid root item size, have %u expect %zu",
+-                          btrfs_item_size_nr(leaf, slot), sizeof(ri));
++                          "invalid root item size, have %u expect %zu or %u",
++                          btrfs_item_size_nr(leaf, slot), sizeof(ri),
++                          btrfs_legacy_root_item_size());
+       }
++      /*
++       * For legacy root item, the members starting at generation_v2 will be
++       * all filled with 0.
++       * And since we allow generation_v2 as 0, it will still pass the check.
++       */
+       read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot),
+-                         sizeof(ri));
++                         btrfs_item_size_nr(leaf, slot));
+       /* Generation related */
+       if (btrfs_root_generation(&ri) >
+--- a/include/uapi/linux/btrfs_tree.h
++++ b/include/uapi/linux/btrfs_tree.h
+@@ -4,6 +4,11 @@
+ #include <linux/btrfs.h>
+ #include <linux/types.h>
++#ifdef __KERNEL__
++#include <linux/stddef.h>
++#else
++#include <stddef.h>
++#endif
+ /*
+  * This header contains the structure definitions and constants used
+@@ -645,6 +650,15 @@ struct btrfs_root_item {
+ } __attribute__ ((__packed__));
+ /*
++ * Btrfs root item used to be smaller than current size.  The old format ends
++ * at where member generation_v2 is.
++ */
++static inline __u32 btrfs_legacy_root_item_size(void)
++{
++      return offsetof(struct btrfs_root_item, generation_v2);
++}
++
++/*
+  * this is used for both forward and backward root refs
+  */
+ struct btrfs_root_ref {
diff --git a/queue-5.9/btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch b/queue-5.9/btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch
new file mode 100644 (file)
index 0000000..4af3eb3
--- /dev/null
@@ -0,0 +1,64 @@
+From 85d07fbe09efd1c529ff3e025e2f0d2c6c96a1b7 Mon Sep 17 00:00:00 2001
+From: Daniel Xu <dxu@dxuuu.xyz>
+Date: Thu, 8 Oct 2020 18:09:10 -0700
+Subject: btrfs: tree-checker: validate number of chunk stripes and parity
+
+From: Daniel Xu <dxu@dxuuu.xyz>
+
+commit 85d07fbe09efd1c529ff3e025e2f0d2c6c96a1b7 upstream.
+
+If there's no parity and num_stripes < ncopies, a crafted image can
+trigger a division by zero in calc_stripe_length().
+
+The image was generated through fuzzing.
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209587
+Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-checker.c |   18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct exten
+       u64 type;
+       u64 features;
+       bool mixed = false;
++      int raid_index;
++      int nparity;
++      int ncopies;
+       length = btrfs_chunk_length(leaf, chunk);
+       stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+       num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+       sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
+       type = btrfs_chunk_type(leaf, chunk);
++      raid_index = btrfs_bg_flags_to_raid_index(type);
++      ncopies = btrfs_raid_array[raid_index].ncopies;
++      nparity = btrfs_raid_array[raid_index].nparity;
+       if (!num_stripes) {
+               chunk_err(leaf, chunk, logical,
+                         "invalid chunk num_stripes, have %u", num_stripes);
+               return -EUCLEAN;
+       }
++      if (num_stripes < ncopies) {
++              chunk_err(leaf, chunk, logical,
++                        "invalid chunk num_stripes < ncopies, have %u < %d",
++                        num_stripes, ncopies);
++              return -EUCLEAN;
++      }
++      if (nparity && num_stripes == nparity) {
++              chunk_err(leaf, chunk, logical,
++                        "invalid chunk num_stripes == nparity, have %u == %d",
++                        num_stripes, nparity);
++              return -EUCLEAN;
++      }
+       if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+               chunk_err(leaf, chunk, logical,
+               "invalid chunk logical, have %llu should aligned to %u",
diff --git a/queue-5.9/btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch b/queue-5.9/btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch
new file mode 100644 (file)
index 0000000..76428fa
--- /dev/null
@@ -0,0 +1,37 @@
+From 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 Mon Sep 17 00:00:00 2001
+From: Denis Efremov <efremov@linux.com>
+Date: Mon, 21 Sep 2020 20:03:35 +0300
+Subject: btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send()
+
+From: Denis Efremov <efremov@linux.com>
+
+commit 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 upstream.
+
+btrfs_ioctl_send() used an open-coded kvzalloc implementation earlier.
+The code was accidentally replaced with a kzalloc() call [1]. Restore
+the original code by using kvzalloc() to allocate sctx->clone_roots.
+
+[1] https://patchwork.kernel.org/patch/9757891/#20529627
+
+Fixes: 818e010bf9d0 ("btrfs: replace opencoded kvzalloc with the helper")
+CC: stable@vger.kernel.org # 4.14+
+Signed-off-by: Denis Efremov <efremov@linux.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -7300,7 +7300,7 @@ long btrfs_ioctl_send(struct file *mnt_f
+       alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
+-      sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
++      sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL);
+       if (!sctx->clone_roots) {
+               ret = -ENOMEM;
+               goto out;
diff --git a/queue-5.9/pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch b/queue-5.9/pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch
new file mode 100644 (file)
index 0000000..bd77587
--- /dev/null
@@ -0,0 +1,84 @@
+From d12544fb2aa9944b180c35914031a8384ab082c1 Mon Sep 17 00:00:00 2001
+From: Xiang Chen <chenxiang66@hisilicon.com>
+Date: Tue, 22 Sep 2020 21:11:06 +0800
+Subject: PM: runtime: Remove link state checks in rpm_get/put_supplier()
+
+From: Xiang Chen <chenxiang66@hisilicon.com>
+
+commit d12544fb2aa9944b180c35914031a8384ab082c1 upstream.
+
+To support runtime PM for hisi SAS driver (the driver is in directory
+drivers/scsi/hisi_sas), we add device link between scsi_device->sdev_gendev
+(consumer device) and hisi_hba->dev(supplier device) with flags
+DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE.
+
+After runtime suspending the consumers and the supplier, unloading the driver
+causes a hang.
+
+We found that it called function device_release_driver_internal() to
+release the supplier device (hisi_hba->dev), as the device link was
+busy, it set the device link state to DL_STATE_SUPPLIER_UNBIND, and
+then it called device_release_driver_internal() to release the consumer
+device (scsi_device->sdev_gendev).
+
+Then it would try to call pm_runtime_get_sync() to resume the consumer
+device, but because consumer-supplier relation existed, it would try
+to resume the supplier first, but as the link state was already
+DL_STATE_SUPPLIER_UNBIND, so it skipped resuming the supplier and only
+resumed the consumer, which hung (it sends IOs to resume scsi_device
+while the SAS controller is suspended).
+
+Simple flow is as follows:
+
+device_release_driver_internal -> (supplier device)
+    if device_links_busy ->
+       device_links_unbind_consumers ->
+           ...
+           WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND)
+           device_release_driver_internal (consumer device)
+    pm_runtime_get_sync -> (consumer device)
+       ...
+       __rpm_callback ->
+           rpm_get_suppliers ->
+               if link->status == DL_STATE_SUPPLIER_UNBIND -> skip the action of resuming the supplier
+               ...
+    pm_runtime_clean_up_links
+    ...
+
+Correct suspend/resume ordering between a supplier device and its consumer
+devices (resume the supplier device before resuming consumer devices, and
+suspend consumer devices before suspending the supplier device) should be
+guaranteed by runtime PM, but the state checks in rpm_get_supplier() and
+rpm_put_supplier() break this rule, so remove them.
+
+Signed-off-by: Xiang Chen <chenxiang66@hisilicon.com>
+[ rjw: Subject and changelog edits ]
+Cc: All applicable <stable@vger.kernel.org>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/power/runtime.c |    5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/drivers/base/power/runtime.c
++++ b/drivers/base/power/runtime.c
+@@ -291,8 +291,7 @@ static int rpm_get_suppliers(struct devi
+                               device_links_read_lock_held()) {
+               int retval;
+-              if (!(link->flags & DL_FLAG_PM_RUNTIME) ||
+-                  READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
++              if (!(link->flags & DL_FLAG_PM_RUNTIME))
+                       continue;
+               retval = pm_runtime_get_sync(link->supplier);
+@@ -312,8 +311,6 @@ static void rpm_put_suppliers(struct dev
+       list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+                               device_links_read_lock_held()) {
+-              if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
+-                      continue;
+               while (refcount_dec_not_one(&link->rpm_active))
+                       pm_runtime_put(link->supplier);
diff --git a/queue-5.9/scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch b/queue-5.9/scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch
new file mode 100644 (file)
index 0000000..f0fdcdb
--- /dev/null
@@ -0,0 +1,77 @@
+From 2f4843b172c2c0360ee7792ad98025fae7baefde Mon Sep 17 00:00:00 2001
+From: Helge Deller <deller@gmx.de>
+Date: Thu, 22 Oct 2020 11:00:05 +0200
+Subject: scsi: mptfusion: Fix null pointer dereferences in mptscsih_remove()
+
+From: Helge Deller <deller@gmx.de>
+
+commit 2f4843b172c2c0360ee7792ad98025fae7baefde upstream.
+
+The mptscsih_remove() function triggers a kernel oops if the Scsi_Host
+pointer (ioc->sh) is NULL, as can be seen in this syslog:
+
+ ioc0: LSI53C1030 B2: Capabilities={Initiator,Target}
+ Begin: Waiting for root file system ...
+ scsi host2: error handler thread failed to spawn, error = -4
+ mptspi: ioc0: WARNING - Unable to register controller with SCSI subsystem
+ Backtrace:
+  [<000000001045b7cc>] mptspi_probe+0x248/0x3d0 [mptspi]
+  [<0000000040946470>] pci_device_probe+0x1ac/0x2d8
+  [<0000000040add668>] really_probe+0x1bc/0x988
+  [<0000000040ade704>] driver_probe_device+0x160/0x218
+  [<0000000040adee24>] device_driver_attach+0x160/0x188
+  [<0000000040adef90>] __driver_attach+0x144/0x320
+  [<0000000040ad7c78>] bus_for_each_dev+0xd4/0x158
+  [<0000000040adc138>] driver_attach+0x4c/0x80
+  [<0000000040adb3ec>] bus_add_driver+0x3e0/0x498
+  [<0000000040ae0130>] driver_register+0xf4/0x298
+  [<00000000409450c4>] __pci_register_driver+0x78/0xa8
+  [<000000000007d248>] mptspi_init+0x18c/0x1c4 [mptspi]
+
+This patch adds the necessary NULL-pointer checks.  Successfully tested on
+a HP C8000 parisc workstation with buggy SCSI drives.
+
+Link: https://lore.kernel.org/r/20201022090005.GA9000@ls3530.fritz.box
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/message/fusion/mptscsih.c |   13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/drivers/message/fusion/mptscsih.c
++++ b/drivers/message/fusion/mptscsih.c
+@@ -1176,8 +1176,10 @@ mptscsih_remove(struct pci_dev *pdev)
+       MPT_SCSI_HOST           *hd;
+       int sz1;
+-      if((hd = shost_priv(host)) == NULL)
+-              return;
++      if (host == NULL)
++              hd = NULL;
++      else
++              hd = shost_priv(host);
+       mptscsih_shutdown(pdev);
+@@ -1193,14 +1195,15 @@ mptscsih_remove(struct pci_dev *pdev)
+           "Free'd ScsiLookup (%d) memory\n",
+           ioc->name, sz1));
+-      kfree(hd->info_kbuf);
++      if (hd)
++              kfree(hd->info_kbuf);
+       /* NULL the Scsi_Host pointer
+        */
+       ioc->sh = NULL;
+-      scsi_host_put(host);
+-
++      if (host)
++              scsi_host_put(host);
+       mpt_detach(pdev);
+ }
diff --git a/queue-5.9/scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch b/queue-5.9/scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch
new file mode 100644 (file)
index 0000000..a8be4b6
--- /dev/null
@@ -0,0 +1,49 @@
+From 50457dab670f396557e60c07f086358460876353 Mon Sep 17 00:00:00 2001
+From: Quinn Tran <qutran@marvell.com>
+Date: Tue, 29 Sep 2020 03:21:50 -0700
+Subject: scsi: qla2xxx: Fix crash on session cleanup with unload
+
+From: Quinn Tran <qutran@marvell.com>
+
+commit 50457dab670f396557e60c07f086358460876353 upstream.
+
+On unload, session cleanup prematurely gave the signal for the driver unload
+path to advance.
+
+Link: https://lore.kernel.org/r/20200929102152.32278-6-njavali@marvell.com
+Fixes: 726b85487067 ("qla2xxx: Add framework for async fabric discovery")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Quinn Tran <qutran@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_target.c |   13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_target.c
++++ b/drivers/scsi/qla2xxx/qla_target.c
+@@ -1229,14 +1229,15 @@ void qlt_schedule_sess_for_deletion(stru
+       case DSC_DELETE_PEND:
+               return;
+       case DSC_DELETED:
+-              if (tgt && tgt->tgt_stop && (tgt->sess_count == 0))
+-                      wake_up_all(&tgt->waitQ);
+-              if (sess->vha->fcport_count == 0)
+-                      wake_up_all(&sess->vha->fcport_waitQ);
+-
+               if (!sess->plogi_link[QLT_PLOGI_LINK_SAME_WWN] &&
+-                      !sess->plogi_link[QLT_PLOGI_LINK_CONFLICT])
++                      !sess->plogi_link[QLT_PLOGI_LINK_CONFLICT]) {
++                      if (tgt && tgt->tgt_stop && tgt->sess_count == 0)
++                              wake_up_all(&tgt->waitQ);
++
++                      if (sess->vha->fcport_count == 0)
++                              wake_up_all(&sess->vha->fcport_waitQ);
+                       return;
++              }
+               break;
+       case DSC_UPD_FCPORT:
+               /*
diff --git a/queue-5.9/scsi-qla2xxx-fix-mpi-reset-needed-message.patch b/queue-5.9/scsi-qla2xxx-fix-mpi-reset-needed-message.patch
new file mode 100644 (file)
index 0000000..c44ebcc
--- /dev/null
@@ -0,0 +1,40 @@
+From 7a6cdbd5e87515ebf6231b762ad903c7cff87b9c Mon Sep 17 00:00:00 2001
+From: Arun Easi <aeasi@marvell.com>
+Date: Tue, 29 Sep 2020 03:21:48 -0700
+Subject: scsi: qla2xxx: Fix MPI reset needed message
+
+From: Arun Easi <aeasi@marvell.com>
+
+commit 7a6cdbd5e87515ebf6231b762ad903c7cff87b9c upstream.
+
+When printing the message:
+
+  "MPI Heartbeat stop. MPI reset is not needed.."
+
+..the wrong register was checked leading to always printing that MPI reset
+is not needed, even when it is needed. Fix the MPI reset message.
+
+Link: https://lore.kernel.org/r/20200929102152.32278-4-njavali@marvell.com
+Fixes: cbb01c2f2f63 ("scsi: qla2xxx: Fix MPI failure AEN (8200) handling")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_isr.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/scsi/qla2xxx/qla_isr.c
++++ b/drivers/scsi/qla2xxx/qla_isr.c
+@@ -767,7 +767,7 @@ qla27xx_handle_8200_aen(scsi_qla_host_t
+       ql_log(ql_log_warn, vha, 0x02f0,
+              "MPI Heartbeat stop. MPI reset is%s needed. "
+              "MB0[%xh] MB1[%xh] MB2[%xh] MB3[%xh]\n",
+-             mb[0] & BIT_8 ? "" : " not",
++             mb[1] & BIT_8 ? "" : " not",
+              mb[0], mb[1], mb[2], mb[3]);
+       if ((mb[1] & BIT_8) == 0)
diff --git a/queue-5.9/scsi-qla2xxx-fix-reset-of-mpi-firmware.patch b/queue-5.9/scsi-qla2xxx-fix-reset-of-mpi-firmware.patch
new file mode 100644 (file)
index 0000000..9991155
--- /dev/null
@@ -0,0 +1,171 @@
+From 3e6efab865ac943f4ec43913eb665695737112b0 Mon Sep 17 00:00:00 2001
+From: Arun Easi <aeasi@marvell.com>
+Date: Tue, 29 Sep 2020 03:21:49 -0700
+Subject: scsi: qla2xxx: Fix reset of MPI firmware
+
+From: Arun Easi <aeasi@marvell.com>
+
+commit 3e6efab865ac943f4ec43913eb665695737112b0 upstream.
+
+Normally, the MPI firmware is reset when an MPI dump is collected.  If an
+unsaved MPI dump exists in the driver, though, an alternate mechanism is
+used. This mechanism, which was not fully correct, is not recommended and
+instead an MPI dump template walk is suggested to perform the MPI reset.
+
+To allow for the MPI dump template walk, extra space is reserved in the MPI
+dump buffer which gets used only when there is already an MPI dump in
+place.
+
+Link: https://lore.kernel.org/r/20200929102152.32278-5-njavali@marvell.com
+Fixes: cbb01c2f2f63 ("scsi: qla2xxx: Fix MPI failure AEN (8200) handling")
+Cc: stable@vger.kernel.org
+Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
+Signed-off-by: Arun Easi <aeasi@marvell.com>
+Signed-off-by: Nilesh Javali <njavali@marvell.com>
+Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/scsi/qla2xxx/qla_attr.c |   10 ++++++--
+ drivers/scsi/qla2xxx/qla_gbl.h  |    1 
+ drivers/scsi/qla2xxx/qla_init.c |    2 +
+ drivers/scsi/qla2xxx/qla_tmpl.c |   49 ++++++++++------------------------------
+ 4 files changed, 23 insertions(+), 39 deletions(-)
+
+--- a/drivers/scsi/qla2xxx/qla_attr.c
++++ b/drivers/scsi/qla2xxx/qla_attr.c
+@@ -157,6 +157,14 @@ qla2x00_sysfs_write_fw_dump(struct file
+                              vha->host_no);
+               }
+               break;
++      case 10:
++              if (IS_QLA27XX(ha) || IS_QLA28XX(ha)) {
++                      ql_log(ql_log_info, vha, 0x70e9,
++                             "Issuing MPI firmware dump on host#%ld.\n",
++                             vha->host_no);
++                      ha->isp_ops->mpi_fw_dump(vha, 0);
++              }
++              break;
+       }
+       return count;
+ }
+@@ -744,8 +752,6 @@ qla2x00_sysfs_write_reset(struct file *f
+                       qla83xx_idc_audit(vha, IDC_AUDIT_TIMESTAMP);
+                       qla83xx_idc_unlock(vha, 0);
+                       break;
+-              } else if (IS_QLA27XX(ha) || IS_QLA28XX(ha)) {
+-                      qla27xx_reset_mpi(vha);
+               } else {
+                       /* Make sure FC side is not in reset */
+                       WARN_ON_ONCE(qla2x00_wait_for_hba_online(vha) !=
+--- a/drivers/scsi/qla2xxx/qla_gbl.h
++++ b/drivers/scsi/qla2xxx/qla_gbl.h
+@@ -938,6 +938,5 @@ extern void qla24xx_process_purex_list(s
+ /* nvme.c */
+ void qla_nvme_unregister_remote_port(struct fc_port *fcport);
+-void qla27xx_reset_mpi(scsi_qla_host_t *vha);
+ void qla_handle_els_plogi_done(scsi_qla_host_t *vha, struct event_arg *ea);
+ #endif /* _QLA_GBL_H */
+--- a/drivers/scsi/qla2xxx/qla_init.c
++++ b/drivers/scsi/qla2xxx/qla_init.c
+@@ -3298,6 +3298,8 @@ qla2x00_alloc_fw_dump(scsi_qla_host_t *v
+                           j, fwdt->dump_size);
+                       dump_size += fwdt->dump_size;
+               }
++              /* Add space for spare MPI fw dump. */
++              dump_size += ha->fwdt[1].dump_size;
+       } else {
+               req_q_size = req->length * sizeof(request_t);
+               rsp_q_size = rsp->length * sizeof(response_t);
+--- a/drivers/scsi/qla2xxx/qla_tmpl.c
++++ b/drivers/scsi/qla2xxx/qla_tmpl.c
+@@ -12,33 +12,6 @@
+ #define IOBASE(vha)   IOBAR(ISPREG(vha))
+ #define INVALID_ENTRY ((struct qla27xx_fwdt_entry *)0xffffffffffffffffUL)
+-/* hardware_lock assumed held. */
+-static void
+-qla27xx_write_remote_reg(struct scsi_qla_host *vha,
+-                       u32 addr, u32 data)
+-{
+-      struct device_reg_24xx __iomem *reg = &vha->hw->iobase->isp24;
+-
+-      ql_dbg(ql_dbg_misc, vha, 0xd300,
+-             "%s: addr/data = %xh/%xh\n", __func__, addr, data);
+-
+-      wrt_reg_dword(&reg->iobase_addr, 0x40);
+-      wrt_reg_dword(&reg->iobase_c4, data);
+-      wrt_reg_dword(&reg->iobase_window, addr);
+-}
+-
+-void
+-qla27xx_reset_mpi(scsi_qla_host_t *vha)
+-{
+-      ql_dbg(ql_dbg_misc + ql_dbg_verbose, vha, 0xd301,
+-             "Entered %s.\n", __func__);
+-
+-      qla27xx_write_remote_reg(vha, 0x104050, 0x40004);
+-      qla27xx_write_remote_reg(vha, 0x10405c, 0x4);
+-
+-      vha->hw->stat.num_mpi_reset++;
+-}
+-
+ static inline void
+ qla27xx_insert16(uint16_t value, void *buf, ulong *len)
+ {
+@@ -1028,7 +1001,6 @@ void
+ qla27xx_mpi_fwdump(scsi_qla_host_t *vha, int hardware_locked)
+ {
+       ulong flags = 0;
+-      bool need_mpi_reset = true;
+ #ifndef __CHECKER__
+       if (!hardware_locked)
+@@ -1036,14 +1008,20 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha,
+ #endif
+       if (!vha->hw->mpi_fw_dump) {
+               ql_log(ql_log_warn, vha, 0x02f3, "-> mpi_fwdump no buffer\n");
+-      } else if (vha->hw->mpi_fw_dumped) {
+-              ql_log(ql_log_warn, vha, 0x02f4,
+-                     "-> MPI firmware already dumped (%p) -- ignoring request\n",
+-                     vha->hw->mpi_fw_dump);
+       } else {
+               struct fwdt *fwdt = &vha->hw->fwdt[1];
+               ulong len;
+               void *buf = vha->hw->mpi_fw_dump;
++              bool walk_template_only = false;
++
++              if (vha->hw->mpi_fw_dumped) {
++                      /* Use the spare area for any further dumps. */
++                      buf += fwdt->dump_size;
++                      walk_template_only = true;
++                      ql_log(ql_log_warn, vha, 0x02f4,
++                             "-> MPI firmware already dumped -- dump saving to temporary buffer %p.\n",
++                             buf);
++              }
+               ql_log(ql_log_warn, vha, 0x02f5, "-> fwdt1 running...\n");
+               if (!fwdt->template) {
+@@ -1058,9 +1036,10 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha,
+                       ql_log(ql_log_warn, vha, 0x02f7,
+                              "-> fwdt1 fwdump residual=%+ld\n",
+                              fwdt->dump_size - len);
+-              } else {
+-                      need_mpi_reset = false;
+               }
++              vha->hw->stat.num_mpi_reset++;
++              if (walk_template_only)
++                      goto bailout;
+               vha->hw->mpi_fw_dump_len = len;
+               vha->hw->mpi_fw_dumped = 1;
+@@ -1072,8 +1051,6 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha,
+       }
+ bailout:
+-      if (need_mpi_reset)
+-              qla27xx_reset_mpi(vha);
+ #ifndef __CHECKER__
+       if (!hardware_locked)
+               spin_unlock_irqrestore(&vha->hw->hardware_lock, flags);
index 0d42f4226546618bcf104e11eecc60259b0a9cb4..171954010d4963ff9cb7b98486eccb5dbc5345bb 100644 (file)
@@ -213,3 +213,25 @@ acpi-cpufreq-honor-_psd-table-setting-on-new-amd-cpus.patch
 io-wq-assign-numa-node-locality-if-appropriate.patch
 w1-mxc_w1-fix-timeout-resolution-problem-leading-to-bus-error.patch
 fs-kernel_read_file-remove-firmware_prealloc_buffer-enum.patch
+scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch
+scsi-qla2xxx-fix-mpi-reset-needed-message.patch
+scsi-qla2xxx-fix-reset-of-mpi-firmware.patch
+scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch
+pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch
+btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch
+btrfs-improve-device-scanning-messages.patch
+btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch
+btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch
+btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch
+btrfs-reschedule-if-necessary-when-logging-directory-items.patch
+btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch
+btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch
+btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch
+btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch
+btrfs-reschedule-when-cloning-lots-of-extents.patch
+btrfs-cleanup-cow-block-on-error.patch
+btrfs-skip-devices-without-magic-signature-when-mounting.patch
+btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch
+btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch
+btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch
+btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch