From: Greg Kroah-Hartman Date: Tue, 3 Nov 2020 14:07:10 +0000 (+0100) Subject: 5.9-stable patches X-Git-Tag: v4.14.204~34 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=58449d92ab002097169e02fa4a98379819a18bbf;p=thirdparty%2Fkernel%2Fstable-queue.git 5.9-stable patches added patches: btrfs-cleanup-cow-block-on-error.patch btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch btrfs-improve-device-scanning-messages.patch btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch btrfs-reschedule-if-necessary-when-logging-directory-items.patch btrfs-reschedule-when-cloning-lots-of-extents.patch btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch btrfs-skip-devices-without-magic-signature-when-mounting.patch btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch scsi-qla2xxx-fix-mpi-reset-needed-message.patch scsi-qla2xxx-fix-reset-of-mpi-firmware.patch --- diff --git a/queue-5.9/btrfs-cleanup-cow-block-on-error.patch b/queue-5.9/btrfs-cleanup-cow-block-on-error.patch new file mode 100644 index 00000000000..480b9595ebe --- /dev/null +++ b/queue-5.9/btrfs-cleanup-cow-block-on-error.patch @@ -0,0 +1,135 @@ +From 
572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 29 Sep 2020 08:53:54 -0400 +Subject: btrfs: cleanup cow block on error + +From: Josef Bacik + +commit 572c83acdcdafeb04e70aa46be1fa539310be20c upstream. + +In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead +to a system lockup. It gets stuck trying to write back inodes, and the +write back thread was trying to lock an extent buffer: + + $ cat /proc/2143497/stack + [<0>] __btrfs_tree_lock+0x108/0x250 + [<0>] lock_extent_buffer_for_io+0x35e/0x3a0 + [<0>] btree_write_cache_pages+0x15a/0x3b0 + [<0>] do_writepages+0x28/0xb0 + [<0>] __writeback_single_inode+0x54/0x5c0 + [<0>] writeback_sb_inodes+0x1e8/0x510 + [<0>] wb_writeback+0xcc/0x440 + [<0>] wb_workfn+0xd7/0x650 + [<0>] process_one_work+0x236/0x560 + [<0>] worker_thread+0x55/0x3c0 + [<0>] kthread+0x13a/0x150 + [<0>] ret_from_fork+0x1f/0x30 + +This is because we got an error while COWing a block, specifically here + + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + [16402.241552] BTRFS: Transaction aborted (error -2) + [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540 + [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8 + [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 + [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540 + [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282 + [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388 + [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380 + [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000 + [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0 + [16402.255694] R13: 
ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000 + [16402.256321] FS: 00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000 + [16402.256973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0 + [16402.257867] Call Trace: + [16402.258072] btrfs_cow_block+0x109/0x230 + [16402.258356] btrfs_search_slot+0x530/0x9d0 + [16402.258655] btrfs_lookup_file_extent+0x37/0x40 + [16402.259155] __btrfs_drop_extents+0x13c/0xd60 + [16402.259628] ? btrfs_block_rsv_migrate+0x4f/0xb0 + [16402.259949] btrfs_replace_file_extents+0x190/0x820 + [16402.260873] btrfs_clone+0x9ae/0xc00 + [16402.261139] btrfs_extent_same_range+0x66/0x90 + [16402.261771] btrfs_remap_file_range+0x353/0x3b1 + [16402.262333] vfs_dedupe_file_range_one.part.0+0xd5/0x140 + [16402.262821] vfs_dedupe_file_range+0x189/0x220 + [16402.263150] do_vfs_ioctl+0x552/0x700 + [16402.263662] __x64_sys_ioctl+0x62/0xb0 + [16402.264023] do_syscall_64+0x33/0x40 + [16402.264364] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [16402.264862] RIP: 0033:0x7f542c7d15cb + [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb + [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003 + [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64 + [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 + [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80 + [16402.271498] irq event stamp: 0 + [16402.271846] hardirqs last enabled at (0): [<0000000000000000>] 0x0 + [16402.272497] hardirqs last disabled at (0): [] copy_process+0x6b9/0x1ba0 + [16402.273343] softirqs last enabled at (0): [] copy_process+0x6b9/0x1ba0 + [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0 + [16402.274338] ---[ end trace 
737874a5a41a8236 ]--- + [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry + [16402.276179] BTRFS info (device dm-9): forced readonly + [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry + [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry + [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry + [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30 + +The problem here is that as soon as we allocate the new block it is +locked and marked dirty in the btree inode. This means that we could +attempt to writeback this block and need to lock the extent buffer. +However we're not unlocking it here and thus we deadlock. + +Fix this by unlocking the cow block if we have any errors inside of +__btrfs_cow_block, and also free it so we do not leak it. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1061,6 +1061,8 @@ static noinline int __btrfs_cow_block(st + + ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); + if (ret) { ++ btrfs_tree_unlock(cow); ++ free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } +@@ -1068,6 +1070,8 @@ static noinline int __btrfs_cow_block(st + if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ret = btrfs_reloc_cow_block(trans, root, buf, cow); + if (ret) { ++ btrfs_tree_unlock(cow); ++ free_extent_buffer(cow); + btrfs_abort_transaction(trans, ret); + return ret; + } +@@ -1100,6 +1104,8 @@ static noinline int __btrfs_cow_block(st + if (last_ref) { + ret = tree_mod_log_free_eb(buf); + if (ret) { ++ btrfs_tree_unlock(cow); ++ free_extent_buffer(cow); + 
btrfs_abort_transaction(trans, ret); + return ret; + } diff --git a/queue-5.9/btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch b/queue-5.9/btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch new file mode 100644 index 00000000000..e0e0c0db06b --- /dev/null +++ b/queue-5.9/btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch @@ -0,0 +1,221 @@ +From 7837fa88704a66257404bb14144c9e4ab631a28a Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 14 Oct 2020 17:00:51 -0400 +Subject: btrfs: drop the path before adding block group sysfs files + +From: Josef Bacik + +commit 7837fa88704a66257404bb14144c9e4ab631a28a upstream. + +Dave reported a problem with my rwsem conversion patch where we got the +following lockdep splat: + + ====================================================== + WARNING: possible circular locking dependency detected + 5.9.0-default+ #1297 Not tainted + ------------------------------------------------------ + kswapd0/76 is trying to acquire lock: + ffff9d5d25df2530 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + + but task is already holding lock: + ffffffffa40cbba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 + + which lock already depends on the new lock. 
+ + the existing dependency chain (in reverse order) is: + + -> #4 (fs_reclaim){+.+.}-{0:0}: + __lock_acquire+0x582/0xac0 + lock_acquire+0xca/0x430 + fs_reclaim_acquire.part.0+0x25/0x30 + kmem_cache_alloc+0x30/0x9c0 + alloc_inode+0x81/0x90 + iget_locked+0xcd/0x1a0 + kernfs_get_inode+0x1b/0x130 + kernfs_get_tree+0x136/0x210 + sysfs_get_tree+0x1a/0x50 + vfs_get_tree+0x1d/0xb0 + path_mount+0x70f/0xa80 + do_mount+0x75/0x90 + __x64_sys_mount+0x8e/0xd0 + do_syscall_64+0x2d/0x70 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #3 (kernfs_mutex){+.+.}-{3:3}: + __lock_acquire+0x582/0xac0 + lock_acquire+0xca/0x430 + __mutex_lock+0xa0/0xaf0 + kernfs_add_one+0x23/0x150 + kernfs_create_dir_ns+0x58/0x80 + sysfs_create_dir_ns+0x70/0xd0 + kobject_add_internal+0xbb/0x2d0 + kobject_add+0x7a/0xd0 + btrfs_sysfs_add_block_group_type+0x141/0x1d0 [btrfs] + btrfs_read_block_groups+0x1f1/0x8c0 [btrfs] + open_ctree+0x981/0x1108 [btrfs] + btrfs_mount_root.cold+0xe/0xb0 [btrfs] + legacy_get_tree+0x2d/0x60 + vfs_get_tree+0x1d/0xb0 + fc_mount+0xe/0x40 + vfs_kern_mount.part.0+0x71/0x90 + btrfs_mount+0x13b/0x3e0 [btrfs] + legacy_get_tree+0x2d/0x60 + vfs_get_tree+0x1d/0xb0 + path_mount+0x70f/0xa80 + do_mount+0x75/0x90 + __x64_sys_mount+0x8e/0xd0 + do_syscall_64+0x2d/0x70 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #2 (btrfs-extent-00){++++}-{3:3}: + __lock_acquire+0x582/0xac0 + lock_acquire+0xca/0x430 + down_read_nested+0x45/0x220 + __btrfs_tree_read_lock+0x35/0x1c0 [btrfs] + __btrfs_read_lock_root_node+0x3a/0x50 [btrfs] + btrfs_search_slot+0x6d4/0xfd0 [btrfs] + check_committed_ref+0x69/0x200 [btrfs] + btrfs_cross_ref_exist+0x65/0xb0 [btrfs] + run_delalloc_nocow+0x446/0x9b0 [btrfs] + btrfs_run_delalloc_range+0x61/0x6a0 [btrfs] + writepage_delalloc+0xae/0x160 [btrfs] + __extent_writepage+0x262/0x420 [btrfs] + extent_write_cache_pages+0x2b6/0x510 [btrfs] + extent_writepages+0x43/0x90 [btrfs] + do_writepages+0x40/0xe0 + __writeback_single_inode+0x62/0x610 + writeback_sb_inodes+0x20f/0x500 + 
wb_writeback+0xef/0x4a0 + wb_do_writeback+0x49/0x2e0 + wb_workfn+0x81/0x340 + process_one_work+0x233/0x5d0 + worker_thread+0x50/0x3b0 + kthread+0x137/0x150 + ret_from_fork+0x1f/0x30 + + -> #1 (btrfs-fs-00){++++}-{3:3}: + __lock_acquire+0x582/0xac0 + lock_acquire+0xca/0x430 + down_read_nested+0x45/0x220 + __btrfs_tree_read_lock+0x35/0x1c0 [btrfs] + __btrfs_read_lock_root_node+0x3a/0x50 [btrfs] + btrfs_search_slot+0x6d4/0xfd0 [btrfs] + btrfs_lookup_inode+0x3a/0xc0 [btrfs] + __btrfs_update_delayed_inode+0x93/0x2c0 [btrfs] + __btrfs_commit_inode_delayed_items+0x7de/0x850 [btrfs] + __btrfs_run_delayed_items+0x8e/0x140 [btrfs] + btrfs_commit_transaction+0x367/0xbc0 [btrfs] + btrfs_mksubvol+0x2db/0x470 [btrfs] + btrfs_mksnapshot+0x7b/0xb0 [btrfs] + __btrfs_ioctl_snap_create+0x16f/0x1a0 [btrfs] + btrfs_ioctl_snap_create_v2+0xb0/0xf0 [btrfs] + btrfs_ioctl+0xd0b/0x2690 [btrfs] + __x64_sys_ioctl+0x6f/0xa0 + do_syscall_64+0x2d/0x70 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #0 (&delayed_node->mutex){+.+.}-{3:3}: + check_prev_add+0x91/0xc60 + validate_chain+0xa6e/0x2a20 + __lock_acquire+0x582/0xac0 + lock_acquire+0xca/0x430 + __mutex_lock+0xa0/0xaf0 + __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + btrfs_evict_inode+0x3cc/0x560 [btrfs] + evict+0xd6/0x1c0 + dispose_list+0x48/0x70 + prune_icache_sb+0x54/0x80 + super_cache_scan+0x121/0x1a0 + do_shrink_slab+0x16d/0x3b0 + shrink_slab+0xb1/0x2e0 + shrink_node+0x230/0x6a0 + balance_pgdat+0x325/0x750 + kswapd+0x206/0x4d0 + kthread+0x137/0x150 + ret_from_fork+0x1f/0x30 + + other info that might help us debug this: + + Chain exists of: + &delayed_node->mutex --> kernfs_mutex --> fs_reclaim + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(fs_reclaim); + lock(kernfs_mutex); + lock(fs_reclaim); + lock(&delayed_node->mutex); + + *** DEADLOCK *** + + 3 locks held by kswapd0/76: + #0: ffffffffa40cbba0 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 + #1: ffffffffa40b8b58 
(shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x54/0x2e0 + #2: ffff9d5d322390e8 (&type->s_umount_key#26){++++}-{3:3}, at: trylock_super+0x16/0x50 + + stack backtrace: + CPU: 2 PID: 76 Comm: kswapd0 Not tainted 5.9.0-default+ #1297 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 + Call Trace: + dump_stack+0x77/0x97 + check_noncircular+0xff/0x110 + ? save_trace+0x50/0x470 + check_prev_add+0x91/0xc60 + validate_chain+0xa6e/0x2a20 + ? save_trace+0x50/0x470 + __lock_acquire+0x582/0xac0 + lock_acquire+0xca/0x430 + ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + __mutex_lock+0xa0/0xaf0 + ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + ? __lock_acquire+0x582/0xac0 + ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + ? btrfs_evict_inode+0x30b/0x560 [btrfs] + ? __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + __btrfs_release_delayed_node.part.0+0x3f/0x320 [btrfs] + btrfs_evict_inode+0x3cc/0x560 [btrfs] + evict+0xd6/0x1c0 + dispose_list+0x48/0x70 + prune_icache_sb+0x54/0x80 + super_cache_scan+0x121/0x1a0 + do_shrink_slab+0x16d/0x3b0 + shrink_slab+0xb1/0x2e0 + shrink_node+0x230/0x6a0 + balance_pgdat+0x325/0x750 + kswapd+0x206/0x4d0 + ? finish_wait+0x90/0x90 + ? balance_pgdat+0x750/0x750 + kthread+0x137/0x150 + ? kthread_mod_delayed_work+0xc0/0xc0 + ret_from_fork+0x1f/0x30 + +This happens because we are still holding the path open when we start +adding the sysfs files for the block groups, which creates a dependency +on fs_reclaim via the tree lock. Fix this by dropping the path before +we start doing anything with sysfs. 
+ +Reported-by: David Sterba +CC: stable@vger.kernel.org # 5.8+ +Reviewed-by: Anand Jain +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/block-group.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -2034,6 +2034,7 @@ int btrfs_read_block_groups(struct btrfs + key.offset = 0; + btrfs_release_path(path); + } ++ btrfs_release_path(path); + + rcu_read_lock(); + list_for_each_entry_rcu(space_info, &info->space_info, list) { diff --git a/queue-5.9/btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch b/queue-5.9/btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch new file mode 100644 index 00000000000..fdbd97aa0c3 --- /dev/null +++ b/queue-5.9/btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch @@ -0,0 +1,678 @@ +From 66d204a16c94f24ad08290a7663ab67e7fc04e82 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 12 Oct 2020 11:55:24 +0100 +Subject: btrfs: fix readahead hang and use-after-free after removing a device + +From: Filipe Manana + +commit 66d204a16c94f24ad08290a7663ab67e7fc04e82 upstream. + +Very sporadically I had test case btrfs/069 from fstests hanging (for +years, it is not a recent regression), with the following traces in +dmesg/syslog: + + [162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started + [162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0 + [162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished + [162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds. + [162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1 + [162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
+ [162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000 + [162513.514751] Call Trace: + [162513.514761] __schedule+0x5ce/0xd00 + [162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60 + [162513.514771] schedule+0x46/0xf0 + [162513.514844] wait_current_trans+0xde/0x140 [btrfs] + [162513.514850] ? finish_wait+0x90/0x90 + [162513.514864] start_transaction+0x37c/0x5f0 [btrfs] + [162513.514879] transaction_kthread+0xa4/0x170 [btrfs] + [162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs] + [162513.514894] kthread+0x153/0x170 + [162513.514897] ? kthread_stop+0x2c0/0x2c0 + [162513.514902] ret_from_fork+0x22/0x30 + [162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds. + [162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1 + [162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000 + [162513.515682] Call Trace: + [162513.515688] __schedule+0x5ce/0xd00 + [162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60 + [162513.515697] schedule+0x46/0xf0 + [162513.515712] wait_current_trans+0xde/0x140 [btrfs] + [162513.515716] ? finish_wait+0x90/0x90 + [162513.515729] start_transaction+0x37c/0x5f0 [btrfs] + [162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs] + [162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs] + [162513.515758] ? __ia32_sys_fdatasync+0x20/0x20 + [162513.515761] iterate_supers+0x87/0xf0 + [162513.515765] ksys_sync+0x60/0xb0 + [162513.515768] __do_sys_sync+0xa/0x10 + [162513.515771] do_syscall_64+0x33/0x80 + [162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [162513.515781] RIP: 0033:0x7f5238f50bd7 + [162513.515782] Code: Bad RIP value. 
+ [162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2 + [162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7 + [162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a + [162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0 + [162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a + [162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340 + [162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds. + [162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1 + [162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000 + [162513.516620] Call Trace: + [162513.516625] __schedule+0x5ce/0xd00 + [162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60 + [162513.516634] schedule+0x46/0xf0 + [162513.516647] wait_current_trans+0xde/0x140 [btrfs] + [162513.516650] ? finish_wait+0x90/0x90 + [162513.516662] start_transaction+0x4d7/0x5f0 [btrfs] + [162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs] + [162513.516686] __vfs_setxattr+0x66/0x80 + [162513.516691] __vfs_setxattr_noperm+0x70/0x200 + [162513.516697] vfs_setxattr+0x6b/0x120 + [162513.516703] setxattr+0x125/0x240 + [162513.516709] ? lock_acquire+0xb1/0x480 + [162513.516712] ? mnt_want_write+0x20/0x50 + [162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0 + [162513.516723] ? preempt_count_add+0x49/0xa0 + [162513.516725] ? __sb_start_write+0x19b/0x290 + [162513.516727] ? preempt_count_add+0x49/0xa0 + [162513.516732] path_setxattr+0xba/0xd0 + [162513.516739] __x64_sys_setxattr+0x27/0x30 + [162513.516741] do_syscall_64+0x33/0x80 + [162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [162513.516745] RIP: 0033:0x7f5238f56d5a + [162513.516746] Code: Bad RIP value. 
+ [162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc + [162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a + [162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470 + [162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700 + [162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004 + [162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0 + [162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds. + [162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1 + [162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000 + [162513.517780] Call Trace: + [162513.517786] __schedule+0x5ce/0xd00 + [162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60 + [162513.517796] schedule+0x46/0xf0 + [162513.517810] wait_current_trans+0xde/0x140 [btrfs] + [162513.517814] ? finish_wait+0x90/0x90 + [162513.517829] start_transaction+0x37c/0x5f0 [btrfs] + [162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs] + [162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs] + [162513.517862] ? __ia32_sys_fdatasync+0x20/0x20 + [162513.517865] iterate_supers+0x87/0xf0 + [162513.517869] ksys_sync+0x60/0xb0 + [162513.517872] __do_sys_sync+0xa/0x10 + [162513.517875] do_syscall_64+0x33/0x80 + [162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [162513.517881] RIP: 0033:0x7f5238f50bd7 + [162513.517883] Code: Bad RIP value. 
+ [162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2 + [162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7 + [162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053 + [162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0 + [162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053 + [162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340 + [162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds. + [162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1 + [162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000 + [162513.519160] Call Trace: + [162513.519165] __schedule+0x5ce/0xd00 + [162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60 + [162513.519174] schedule+0x46/0xf0 + [162513.519190] wait_current_trans+0xde/0x140 [btrfs] + [162513.519193] ? finish_wait+0x90/0x90 + [162513.519206] start_transaction+0x4d7/0x5f0 [btrfs] + [162513.519222] btrfs_create+0x57/0x200 [btrfs] + [162513.519230] lookup_open+0x522/0x650 + [162513.519246] path_openat+0x2b8/0xa50 + [162513.519270] do_filp_open+0x91/0x100 + [162513.519275] ? find_held_lock+0x32/0x90 + [162513.519280] ? lock_acquired+0x33b/0x470 + [162513.519285] ? do_raw_spin_unlock+0x4b/0xc0 + [162513.519287] ? _raw_spin_unlock+0x29/0x40 + [162513.519295] do_sys_openat2+0x20d/0x2d0 + [162513.519300] do_sys_open+0x44/0x80 + [162513.519304] do_syscall_64+0x33/0x80 + [162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [162513.519309] RIP: 0033:0x7f5238f4a903 + [162513.519310] Code: Bad RIP value. 
+ [162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055 + [162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903 + [162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470 + [162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002 + [162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013 + [162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620 + [162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds. + [162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1 + [162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. + [162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002 + [162513.520511] Call Trace: + [162513.520516] __schedule+0x5ce/0xd00 + [162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60 + [162513.520525] schedule+0x46/0xf0 + [162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs] + [162513.520548] ? finish_wait+0x90/0x90 + [162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs] + [162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs] + [162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs] + [162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs] + [162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs] + [162513.520643] ? do_sigaction+0xf3/0x240 + [162513.520645] ? find_held_lock+0x32/0x90 + [162513.520648] ? do_sigaction+0xf3/0x240 + [162513.520651] ? lock_acquired+0x33b/0x470 + [162513.520655] ? _raw_spin_unlock_irq+0x24/0x50 + [162513.520657] ? lockdep_hardirqs_on+0x7d/0x100 + [162513.520660] ? _raw_spin_unlock_irq+0x35/0x50 + [162513.520662] ? do_sigaction+0xf3/0x240 + [162513.520671] ? 
__x64_sys_ioctl+0x83/0xb0 + [162513.520672] __x64_sys_ioctl+0x83/0xb0 + [162513.520677] do_syscall_64+0x33/0x80 + [162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [162513.520681] RIP: 0033:0x7fc3cd307d87 + [162513.520682] Code: Bad RIP value. + [162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 + [162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87 + [162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003 + [162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 + [162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003 + [162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001 + [162513.520703] + Showing all locks held in the system: + [162513.520712] 1 lock held by khungtaskd/54: + [162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197 + [162513.520728] 1 lock held by in:imklog/596: + [162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60 + [162513.520782] 1 lock held by btrfs-transacti/1356167: + [162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs] + [162513.520798] 1 lock held by btrfs/1356190: + [162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60 + [162513.520805] 1 lock held by fsstress/1356184: + [162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0 + [162513.520811] 3 locks held by fsstress/1356185: + [162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50 + [162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120 + [162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] + [162513.520833] 1 
lock held by fsstress/1356196: + [162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0 + [162513.520838] 3 locks held by fsstress/1356197: + [162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50 + [162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50 + [162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] + [162513.520858] 2 locks held by btrfs/1356211: + [162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs] + [162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs] + +This was weird because the stack traces show that a transaction commit, +triggered by a device replace operation, is blocking trying to pause any +running scrubs but there are no stack traces of blocked tasks doing a +scrub. + +After poking around with drgn, I noticed there was a scrub task that was +constantly running and blocking for short periods of time: + + >>> t = find_task(prog, 1356190) + >>> prog.stack_trace(t) + #0 __schedule+0x5ce/0xcfc + #1 schedule+0x46/0xe4 + #2 schedule_timeout+0x1df/0x475 + #3 btrfs_reada_wait+0xda/0x132 + #4 scrub_stripe+0x2a8/0x112f + #5 scrub_chunk+0xcd/0x134 + #6 scrub_enumerate_chunks+0x29e/0x5ee + #7 btrfs_scrub_dev+0x2d5/0x91b + #8 btrfs_ioctl+0x7f5/0x36e7 + #9 __x64_sys_ioctl+0x83/0xb0 + #10 do_syscall_64+0x33/0x77 + #11 entry_SYSCALL_64+0x7c/0x156 + +Which corresponds to: + +int btrfs_reada_wait(void *handle) +{ + struct reada_control *rc = handle; + struct btrfs_fs_info *fs_info = rc->fs_info; + + while (atomic_read(&rc->elems)) { + if (!atomic_read(&fs_info->reada_works_cnt)) + reada_start_machine(fs_info); + wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, + (HZ + 9) / 10); + } +(...)
+ +So the counter "rc->elems" was set to 1 and never decreased to 0, causing +the scrub task to loop forever in that function. Then I used the following +script for drgn to check the readahead requests: + + $ cat dump_reada.py + import sys + import drgn + from drgn import NULL, Object, cast, container_of, execscript, \ + reinterpret, sizeof + from drgn.helpers.linux import * + + mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1" + + mnt = None + for mnt in for_each_mount(prog, dst = mnt_path): + pass + + if mnt is None: + sys.stderr.write(f'Error: mount point {mnt_path} not found\n') + sys.exit(1) + + fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info) + + def dump_re(re): + nzones = re.nzones.value_() + print(f're at {hex(re.value_())}') + print(f'\t logical {re.logical.value_()}') + print(f'\t refcnt {re.refcnt.value_()}') + print(f'\t nzones {nzones}') + for i in range(nzones): + dev = re.zones[i].device + name = dev.name.str.string_() + print(f'\t\t dev id {dev.devid.value_()} name {name}') + print() + + for _, e in radix_tree_for_each(fs_info.reada_tree): + re = cast('struct reada_extent *', e) + dump_re(re) + + $ drgn dump_reada.py + re at 0xffff8f3da9d25ad8 + logical 38928384 + refcnt 1 + nzones 1 + dev id 0 name b'/dev/sdd' + $ + +So there was one readahead extent with a single zone corresponding to the +source device of that last device replace operation logged in dmesg/syslog. +Also the ID of that zone's device was 0 which is a special value set in +the source device of a device replace operation when the operation finishes +(constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()), +confirming again that device /dev/sdd was the source of a device replace +operation. 
+ +Normally there should be as many zones in the readahead extent as there are +devices, and I wasn't expecting the extent to be in a block group with a +'single' profile, so I went and confirmed with the following drgn script +that there weren't any single profile block groups: + + $ cat dump_block_groups.py + import sys + import drgn + from drgn import NULL, Object, cast, container_of, execscript, \ + reinterpret, sizeof + from drgn.helpers.linux import * + + mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1" + + mnt = None + for mnt in for_each_mount(prog, dst = mnt_path): + pass + + if mnt is None: + sys.stderr.write(f'Error: mount point {mnt_path} not found\n') + sys.exit(1) + + fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info) + + BTRFS_BLOCK_GROUP_DATA = (1 << 0) + BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1) + BTRFS_BLOCK_GROUP_METADATA = (1 << 2) + BTRFS_BLOCK_GROUP_RAID0 = (1 << 3) + BTRFS_BLOCK_GROUP_RAID1 = (1 << 4) + BTRFS_BLOCK_GROUP_DUP = (1 << 5) + BTRFS_BLOCK_GROUP_RAID10 = (1 << 6) + BTRFS_BLOCK_GROUP_RAID5 = (1 << 7) + BTRFS_BLOCK_GROUP_RAID6 = (1 << 8) + BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9) + BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10) + + def bg_flags_string(bg): + flags = bg.flags.value_() + ret = '' + if flags & BTRFS_BLOCK_GROUP_DATA: + ret = 'data' + if flags & BTRFS_BLOCK_GROUP_METADATA: + if len(ret) > 0: + ret += '|' + ret += 'meta' + if flags & BTRFS_BLOCK_GROUP_SYSTEM: + if len(ret) > 0: + ret += '|' + ret += 'system' + if flags & BTRFS_BLOCK_GROUP_RAID0: + ret += ' raid0' + elif flags & BTRFS_BLOCK_GROUP_RAID1: + ret += ' raid1' + elif flags & BTRFS_BLOCK_GROUP_DUP: + ret += ' dup' + elif flags & BTRFS_BLOCK_GROUP_RAID10: + ret += ' raid10' + elif flags & BTRFS_BLOCK_GROUP_RAID5: + ret += ' raid5' + elif flags & BTRFS_BLOCK_GROUP_RAID6: + ret += ' raid6' + elif flags & BTRFS_BLOCK_GROUP_RAID1C3: + ret += ' raid1c3' + elif flags & BTRFS_BLOCK_GROUP_RAID1C4: + ret += ' raid1c4' + else: + ret += ' single' + + return ret + + def 
dump_bg(bg): + print() + print(f'block group at {hex(bg.value_())}') + print(f'\t start {bg.start.value_()} length {bg.length.value_()}') + print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}') + + bg_root = fs_info.block_group_cache_tree.address_of_() + for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'): + dump_bg(bg) + + $ drgn dump_block_groups.py + + block group at 0xffff8f3d673b0400 + start 22020096 length 16777216 + flags 258 - system raid6 + + block group at 0xffff8f3d53ddb400 + start 38797312 length 536870912 + flags 260 - meta raid6 + + block group at 0xffff8f3d5f4d9c00 + start 575668224 length 2147483648 + flags 257 - data raid6 + + block group at 0xffff8f3d08189000 + start 2723151872 length 67108864 + flags 258 - system raid6 + + block group at 0xffff8f3db70ff000 + start 2790260736 length 1073741824 + flags 260 - meta raid6 + + block group at 0xffff8f3d5f4dd800 + start 3864002560 length 67108864 + flags 258 - system raid6 + + block group at 0xffff8f3d67037000 + start 3931111424 length 2147483648 + flags 257 - data raid6 + $ + +So there were only 2 reasons left for having a readahead extent with a +single zone: reada_find_zone(), called when creating a readahead extent, +returned NULL either because we failed to find the corresponding block +group or because a memory allocation failed. With some additional and +custom tracing I figured out that on every further ocurrence of the +problem the block group had just been deleted when we were looping to +create the zones for the readahead extent (at reada_find_extent()), so we +ended up with only one zone in the readahead extent, corresponding to a +device that ends up getting replaced. 
+ +So after figuring that out it became obvious why the hang happens: + +1) Task A starts a scrub on any device of the filesystem, except for + device /dev/sdd; + +2) Task B starts a device replace with /dev/sdd as the source device; + +3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently + starting to scrub a stripe from block group X. This call to + btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add() + calls reada_add_block(), it passes the logical address of the extent + tree's root node as its 'logical' argument - a value of 38928384; + +4) Task A then enters reada_find_extent(), called from reada_add_block(). + It finds there isn't any existing readahead extent for the logical + address 38928384, so it proceeds to the path of creating a new one. + + It calls btrfs_map_block() to find out which stripes exist for the block + group X. On the first iteration of the for loop that iterates over the + stripes, it finds the stripe for device /dev/sdd, so it creates one + zone for that device and adds it to the readahead extent. Before getting + into the second iteration of the loop, the cleanup kthread deletes block + group X because it was empty. So in the iterations for the remaining + stripes it does not add more zones to the readahead extent, because the + calls to reada_find_zone() returned NULL because they couldn't find + block group X anymore. + + As a result the new readahead extent has a single zone, corresponding to + the device /dev/sdd; + +5) Before task A returns to btrfs_reada_add() and queues the readahead job + for the readahead work queue, task B finishes the device replace and at + btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new + device /dev/sdg; + +6) Task A returns to reada_add_block(), which increments the counter + "->elems" of the reada_control structure allocated at btrfs_reada_add(). + + Then it returns back to btrfs_reada_add() and calls + reada_start_machine().
This queues a job in the readahead work queue to + run the function reada_start_machine_worker(), which calls + __reada_start_machine(). + + At __reada_start_machine() we take the device list mutex and for each + device found in the current device list, we call + reada_start_machine_dev() to start the readahead work. However at this + point the device /dev/sdd was already freed and is not in the device + list anymore. + + This means the corresponding readahead for the extent at 38928384 is + never started, and therefore the "->elems" counter of the reada_control + structure allocated at btrfs_reada_add() never goes down to 0, causing + the call to btrfs_reada_wait(), done by the scrub task, to wait forever. + +Note that the readahead request can be made either after the device replace +started or before it started, however in practice it is very unlikely that a +device replace is able to start after a readahead request is made and is +able to complete before the readahead request completes - maybe only on a +very small and nearly empty filesystem. + +This hang however is not the only problem we can have with readahead and +device removals. When the readahead extent has other zones other than the +one corresponding to the device that is being removed (either by a device +replace or a device remove operation), we risk having a use-after-free on +the device when dropping the last reference of the readahead extent. + +For example if we create a readahead extent with two zones, one for the +device /dev/sdd and one for the device /dev/sde: + +1) Before the readahead worker starts, the device /dev/sdd is removed, + and the corresponding btrfs_device structure is freed.
However the + readahead extent still has the zone pointing to the device structure; + +2) When the readahead worker starts, it only finds device /dev/sde in the + current device list of the filesystem; + +3) It starts the readahead work, at reada_start_machine_dev(), using the + device /dev/sde; + +4) Then when it finishes reading the extent from device /dev/sde, it calls + __readahead_hook() which ends up dropping the last reference on the + readahead extent through the last call to reada_extent_put(); + +5) At reada_extent_put() it iterates over each zone of the readahead extent + and attempts to delete an element from the device's 'reada_extents' + radix tree, resulting in a use-after-free, as the device pointer of the + zone for /dev/sdd is now stale. We can also access the device after + dropping the last reference of a zone, through reada_zone_release(), + also called by reada_extent_put(). + +And a device remove suffers the same problem, however since it shrinks the +device size down to zero before removing the device, it is very unlikely to +still have readahead requests not completed by the time we free the device, +the only possibility is if the device has very little space allocated. + +While the hang problem is exclusive to scrub, since it is currently the +only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free +problem affects any path that triggers readahead, which includes +btree_readahead_hook() and __readahead_hook() (a readahead worker can +trigger readahead for the children of a node) for example - any path that +ends up calling reada_add_block() can trigger the use-after-free after a +device is removed. + +So fix this by waiting for any readahead requests for a device to complete +before removing a device, ensuring that while waiting for existing ones no +new ones can be made.
+ +This problem has been around for a very long time - the readahead code was +added in 2011, device remove exists since 2008 and device replace was +introduced in 2013, hard to pick a specific commit for a git Fixes tag. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 2 ++ + fs/btrfs/dev-replace.c | 5 +++++ + fs/btrfs/reada.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/volumes.c | 3 +++ + fs/btrfs/volumes.h | 1 + + 5 files changed, 56 insertions(+) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -3517,6 +3517,8 @@ struct reada_control *btrfs_reada_add(st + int btrfs_reada_wait(void *handle); + void btrfs_reada_detach(void *handle); + int btree_readahead_hook(struct extent_buffer *eb, int err); ++void btrfs_reada_remove_dev(struct btrfs_device *dev); ++void btrfs_reada_undo_remove_dev(struct btrfs_device *dev); + + static inline int is_fstree(u64 rootid) + { +--- a/fs/btrfs/dev-replace.c ++++ b/fs/btrfs/dev-replace.c +@@ -668,6 +668,9 @@ static int btrfs_dev_replace_finishing(s + } + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + ++ if (!scrub_ret) ++ btrfs_reada_remove_dev(src_device); ++ + /* + * We have to use this loop approach because at this point src_device + * has to be available for transaction commit to complete, yet new +@@ -676,6 +679,7 @@ static int btrfs_dev_replace_finishing(s + while (1) { + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { ++ btrfs_reada_undo_remove_dev(src_device); + mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); + return PTR_ERR(trans); + } +@@ -726,6 +730,7 @@ error: + up_write(&dev_replace->rwsem); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); ++ btrfs_reada_undo_remove_dev(src_device); + btrfs_rm_dev_replace_blocked(fs_info); + if (tgt_device) + 
btrfs_destroy_dev_replace_tgtdev(tgt_device); +--- a/fs/btrfs/reada.c ++++ b/fs/btrfs/reada.c +@@ -421,6 +421,9 @@ static struct reada_extent *reada_find_e + if (!dev->bdev) + continue; + ++ if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) ++ continue; ++ + if (dev_replace_is_ongoing && + dev == fs_info->dev_replace.tgtdev) { + /* +@@ -1014,3 +1017,45 @@ void btrfs_reada_detach(void *handle) + + kref_put(&rc->refcnt, reada_control_release); + } ++ ++/* ++ * Before removing a device (device replace or device remove ioctls), call this ++ * function to wait for all existing readahead requests on the device and to ++ * make sure no one queues more readahead requests for the device. ++ * ++ * Must be called without holding neither the device list mutex nor the device ++ * replace semaphore, otherwise it will deadlock. ++ */ ++void btrfs_reada_remove_dev(struct btrfs_device *dev) ++{ ++ struct btrfs_fs_info *fs_info = dev->fs_info; ++ ++ /* Serialize with readahead extent creation at reada_find_extent(). */ ++ spin_lock(&fs_info->reada_lock); ++ set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); ++ spin_unlock(&fs_info->reada_lock); ++ ++ /* ++ * There might be readahead requests added to the radix trees which ++ * were not yet added to the readahead work queue. We need to start ++ * them and wait for their completion, otherwise we can end up with ++ * use-after-free problems when dropping the last reference on the ++ * readahead extents and their zones, as they need to access the ++ * device structure. ++ */ ++ reada_start_machine(fs_info); ++ btrfs_flush_workqueue(fs_info->readahead_workers); ++} ++ ++/* ++ * If when removing a device (device replace or device remove ioctls) an error ++ * happens after calling btrfs_reada_remove_dev(), call this to undo what that ++ * function did. This is safe to call even if btrfs_reada_remove_dev() was not ++ * called before. 
++ */ ++void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) ++{ ++ spin_lock(&dev->fs_info->reada_lock); ++ clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); ++ spin_unlock(&dev->fs_info->reada_lock); ++} +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2104,6 +2104,8 @@ int btrfs_rm_device(struct btrfs_fs_info + + mutex_unlock(&uuid_mutex); + ret = btrfs_shrink_device(device, 0); ++ if (!ret) ++ btrfs_reada_remove_dev(device); + mutex_lock(&uuid_mutex); + if (ret) + goto error_undo; +@@ -2191,6 +2193,7 @@ out: + return ret; + + error_undo: ++ btrfs_reada_undo_remove_dev(device); + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { + mutex_lock(&fs_info->chunk_mutex); + list_add(&device->dev_alloc_list, +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -50,6 +50,7 @@ struct btrfs_io_geometry { + #define BTRFS_DEV_STATE_MISSING (2) + #define BTRFS_DEV_STATE_REPLACE_TGT (3) + #define BTRFS_DEV_STATE_FLUSH_SENT (4) ++#define BTRFS_DEV_STATE_NO_READA (5) + + struct btrfs_device { + struct list_head dev_list; /* device_list_mutex */ diff --git a/queue-5.9/btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch b/queue-5.9/btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch new file mode 100644 index 00000000000..3435a0d9f6e --- /dev/null +++ b/queue-5.9/btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch @@ -0,0 +1,139 @@ +From 83bc1560e02e25c6439341352024ebe8488f4fbd Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 12 Oct 2020 11:55:23 +0100 +Subject: btrfs: fix use-after-free on readahead extent after failure to create it + +From: Filipe Manana + +commit 83bc1560e02e25c6439341352024ebe8488f4fbd upstream. 
+ +If we fail to find suitable zones for a new readahead extent, we end up +leaving a stale pointer in the global readahead extents radix tree +(fs_info->reada_tree), which can trigger the following trace later on: + + [13367.696354] BUG: kernel NULL pointer dereference, address: 00000000000000b0 + [13367.696802] #PF: supervisor read access in kernel mode + [13367.697249] #PF: error_code(0x0000) - not-present page + [13367.697721] PGD 0 P4D 0 + [13367.698171] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI + [13367.698632] CPU: 6 PID: 851214 Comm: btrfs Tainted: G W 5.9.0-rc6-btrfs-next-69 #1 + [13367.699100] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 + [13367.700069] RIP: 0010:__lock_acquire+0x20a/0x3970 + [13367.700562] Code: ff 1f 0f b7 c0 48 0f (...) + [13367.701609] RSP: 0018:ffffb14448f57790 EFLAGS: 00010046 + [13367.702140] RAX: 0000000000000000 RBX: 29b935140c15e8cf RCX: 0000000000000000 + [13367.702698] RDX: 0000000000000002 RSI: ffffffffb3d66bd0 RDI: 0000000000000046 + [13367.703240] RBP: ffff8a52ba8ac040 R08: 00000c2866ad9288 R09: 0000000000000001 + [13367.703783] R10: 0000000000000001 R11: 00000000b66d9b53 R12: ffff8a52ba8ac9b0 + [13367.704330] R13: 0000000000000000 R14: ffff8a532b6333e8 R15: 0000000000000000 + [13367.704880] FS: 00007fe1df6b5700(0000) GS:ffff8a5376600000(0000) knlGS:0000000000000000 + [13367.705438] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [13367.705995] CR2: 00000000000000b0 CR3: 000000022cca8004 CR4: 00000000003706e0 + [13367.706565] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [13367.707127] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [13367.707686] Call Trace: + [13367.708246] ? ___slab_alloc+0x395/0x740 + [13367.708820] ? reada_add_block+0xae/0xee0 [btrfs] + [13367.709383] lock_acquire+0xb1/0x480 + [13367.709955] ? reada_add_block+0xe0/0xee0 [btrfs] + [13367.710537] ? 
reada_add_block+0xae/0xee0 [btrfs] + [13367.711097] ? rcu_read_lock_sched_held+0x5d/0x90 + [13367.711659] ? kmem_cache_alloc_trace+0x8d2/0x990 + [13367.712221] ? lock_acquired+0x33b/0x470 + [13367.712784] _raw_spin_lock+0x34/0x80 + [13367.713356] ? reada_add_block+0xe0/0xee0 [btrfs] + [13367.713966] reada_add_block+0xe0/0xee0 [btrfs] + [13367.714529] ? btrfs_root_node+0x15/0x1f0 [btrfs] + [13367.715077] btrfs_reada_add+0x117/0x170 [btrfs] + [13367.715620] scrub_stripe+0x21e/0x10d0 [btrfs] + [13367.716141] ? kvm_sched_clock_read+0x5/0x10 + [13367.716657] ? __lock_acquire+0x41e/0x3970 + [13367.717184] ? scrub_chunk+0x60/0x140 [btrfs] + [13367.717697] ? find_held_lock+0x32/0x90 + [13367.718254] ? scrub_chunk+0x60/0x140 [btrfs] + [13367.718773] ? lock_acquired+0x33b/0x470 + [13367.719278] ? scrub_chunk+0xcd/0x140 [btrfs] + [13367.719786] scrub_chunk+0xcd/0x140 [btrfs] + [13367.720291] scrub_enumerate_chunks+0x270/0x5c0 [btrfs] + [13367.720787] ? finish_wait+0x90/0x90 + [13367.721281] btrfs_scrub_dev+0x1ee/0x620 [btrfs] + [13367.721762] ? rcu_read_lock_any_held+0x8e/0xb0 + [13367.722235] ? preempt_count_add+0x49/0xa0 + [13367.722710] ? __sb_start_write+0x19b/0x290 + [13367.723192] btrfs_ioctl+0x7f5/0x36f0 [btrfs] + [13367.723660] ? __fget_files+0x101/0x1d0 + [13367.724118] ? find_held_lock+0x32/0x90 + [13367.724559] ? __fget_files+0x101/0x1d0 + [13367.724982] ? __x64_sys_ioctl+0x83/0xb0 + [13367.725399] __x64_sys_ioctl+0x83/0xb0 + [13367.725802] do_syscall_64+0x33/0x80 + [13367.726188] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [13367.726574] RIP: 0033:0x7fe1df7add87 + [13367.726948] Code: 00 00 00 48 8b 05 09 91 (...) 
+ [13367.727763] RSP: 002b:00007fe1df6b4d48 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + [13367.728179] RAX: ffffffffffffffda RBX: 000055ce1fb596a0 RCX: 00007fe1df7add87 + [13367.728604] RDX: 000055ce1fb596a0 RSI: 00000000c400941b RDI: 0000000000000003 + [13367.729021] RBP: 0000000000000000 R08: 00007fe1df6b5700 R09: 0000000000000000 + [13367.729431] R10: 00007fe1df6b5700 R11: 0000000000000246 R12: 00007ffd922b07de + [13367.729842] R13: 00007ffd922b07df R14: 00007fe1df6b4e40 R15: 0000000000802000 + [13367.730275] Modules linked in: btrfs blake2b_generic xor (...) + [13367.732638] CR2: 00000000000000b0 + [13367.733166] ---[ end trace d298b6805556acd9 ]--- + +What happens is the following: + +1) At reada_find_extent() we don't find any existing readahead extent for + the metadata extent starting at logical address X; + +2) So we proceed to create a new one. We then call btrfs_map_block() to get + information about which stripes contain extent X; + +3) After that we iterate over the stripes and create only one zone for the + readahead extent - only one because reada_find_zone() returned NULL for + all iterations except for one, either because a memory allocation failed + or it couldn't find the block group of the extent (it may have just been + deleted); + +4) We then add the new readahead extent to the readahead extents radix + tree at fs_info->reada_tree; + +5) Then we iterate over each zone of the new readahead extent, and find + that the device used for that zone no longer exists, because it was + removed or it was the source device of a device replace operation. 
+ Since this left 'have_zone' set to 0, after finishing the loop we jump + to the 'error' label, call kfree() on the new readahead extent and + return without removing it from the radix tree at fs_info->reada_tree; + +6) Any future call to reada_find_extent() for the logical address X will + find the stale pointer in the readahead extents radix tree, increment + its reference counter, which can trigger the use-after-free right + away or return it to the caller reada_add_block() that results in the + use-after-free of the example trace above. + +So fix this by making sure we delete the readahead extent from the radix +tree if we fail to setup zones for it (when 'have_zone = 0'). + +Fixes: 319450211842ba ("btrfs: reada: bypass adding extent when all zone failed") +CC: stable@vger.kernel.org # 4.9+ +Reviewed-by: Johannes Thumshirn +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/reada.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/reada.c ++++ b/fs/btrfs/reada.c +@@ -445,6 +445,8 @@ static struct reada_extent *reada_find_e + } + have_zone = 1; + } ++ if (!have_zone) ++ radix_tree_delete(&fs_info->reada_tree, index); + spin_unlock(&fs_info->reada_lock); + up_read(&fs_info->dev_replace.rwsem); + diff --git a/queue-5.9/btrfs-improve-device-scanning-messages.patch b/queue-5.9/btrfs-improve-device-scanning-messages.patch new file mode 100644 index 00000000000..0fb399d8014 --- /dev/null +++ b/queue-5.9/btrfs-improve-device-scanning-messages.patch @@ -0,0 +1,59 @@ +From 79dae17d8d44b2d15779e332180080af45df5352 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Thu, 3 Sep 2020 21:30:12 +0800 +Subject: btrfs: improve device scanning messages + +From: Anand Jain + +commit 79dae17d8d44b2d15779e332180080af45df5352 upstream. + +Systems booting without the initramfs seems to scan an unusual kind +of device path (/dev/root). 
And at a later time, the device is updated +to the correct path. We generally print the process name and PID of the +process scanning the device but we don't capture the same information if +the device path is rescanned with a different pathname. + +The current message is too long, so drop the unnecessary UUID and add +process name and PID. + +While at this also update the duplicate device warning to include the +process name and PID so the messages are consistent + +CC: stable@vger.kernel.org # 4.19+ +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=89721 +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -942,16 +942,18 @@ static noinline struct btrfs_device *dev + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); + btrfs_warn_in_rcu(device->fs_info, +- "duplicate device fsid:devid for %pU:%llu old:%s new:%s", +- disk_super->fsid, devid, +- rcu_str_deref(device->name), path); ++ "duplicate device %s devid %llu generation %llu scanned by %s (%d)", ++ path, devid, found_transid, ++ current->comm, ++ task_pid_nr(current)); + return ERR_PTR(-EEXIST); + } + bdput(path_bdev); + btrfs_info_in_rcu(device->fs_info, +- "device fsid %pU devid %llu moved old:%s new:%s", +- disk_super->fsid, devid, +- rcu_str_deref(device->name), path); ++ "devid %llu device path %s changed to %s scanned by %s (%d)", ++ devid, rcu_str_deref(device->name), ++ path, current->comm, ++ task_pid_nr(current)); + } + + name = rcu_string_strdup(path, GFP_NOFS); diff --git a/queue-5.9/btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch b/queue-5.9/btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch new file mode 100644 index 00000000000..f30d098c433 --- /dev/null +++ 
b/queue-5.9/btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch @@ -0,0 +1,155 @@ +From e85fde5162bf1b242cbd6daf7dba0f9b457d592b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 24 Jul 2020 14:46:10 +0800 +Subject: btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations + +From: Qu Wenruo + +commit e85fde5162bf1b242cbd6daf7dba0f9b457d592b upstream. + +[BUG] +When quota is enabled for TEST_DEV, generic/013 sometimes fails like this: + + generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg) + +And with the following metadata leak: + + BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152 + ------------[ cut here ]------------ + WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs] + Call Trace: + btrfs_put_super+0x15/0x17 [btrfs] + generic_shutdown_super+0x72/0x110 + kill_anon_super+0x18/0x30 + btrfs_kill_super+0x17/0x30 [btrfs] + deactivate_locked_super+0x3b/0xa0 + deactivate_super+0x40/0x50 + cleanup_mnt+0x135/0x190 + __cleanup_mnt+0x12/0x20 + task_work_run+0x64/0xb0 + __prepare_exit_to_usermode+0x1bc/0x1c0 + __syscall_return_slowpath+0x47/0x230 + do_syscall_64+0x64/0xb0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + ---[ end trace a6cfd45ba80e4e06 ]--- + BTRFS error (device dm-3): qgroup reserved space leaked + BTRFS info (device dm-3): disk space caching is enabled + BTRFS info (device dm-3): has skinny extents + +[CAUSE] +The qgroup preallocated meta rsv operations of that offending root are: + + btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072 + btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072 + btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152 + btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072 + btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072 + +It's 
pretty obvious that, we reserve qgroup meta rsv in +btrfs_subvolume_reserve_metadata(), but doesn't have corresponding +release/convert calls in btrfs_subvolume_release_metadata(). + +This leads to the leakage. + +[FIX] +To fix this bug, we should follow what we're doing in +btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and +add it to block_rsv->qgroup_rsv_reserved. + +And free the qgroup reserved metadata space when releasing the +block_rsv. + +To do this, we need to change the btrfs_subvolume_release_metadata() to +accept btrfs_root, and record the qgroup_to_release number, and call +btrfs_qgroup_convert_reserved_meta() for it. + +Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans") +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 2 +- + fs/btrfs/inode.c | 2 +- + fs/btrfs/ioctl.c | 6 +++--- + fs/btrfs/root-tree.c | 13 +++++++++++-- + 4 files changed, 16 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -2619,7 +2619,7 @@ enum btrfs_flush_state { + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, ++void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); + void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4051,7 +4051,7 @@ out_end_trans: + err = ret; + inode->i_flags |= S_DEAD; + out_release: +- btrfs_subvolume_release_metadata(fs_info, &block_rsv); ++ btrfs_subvolume_release_metadata(root, &block_rsv); + out_up_write: + up_write(&fs_info->subvol_sem); + if (err) { +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -618,7 +618,7 @@ static noinline int create_subvol(struct 
+ trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); +- btrfs_subvolume_release_metadata(fs_info, &block_rsv); ++ btrfs_subvolume_release_metadata(root, &block_rsv); + goto fail_free; + } + trans->block_rsv = &block_rsv; +@@ -742,7 +742,7 @@ fail: + kfree(root_item); + trans->block_rsv = NULL; + trans->bytes_reserved = 0; +- btrfs_subvolume_release_metadata(fs_info, &block_rsv); ++ btrfs_subvolume_release_metadata(root, &block_rsv); + + err = btrfs_commit_transaction(trans); + if (err && !ret) +@@ -856,7 +856,7 @@ fail: + if (ret && pending_snapshot->snap) + pending_snapshot->snap->anon_dev = 0; + btrfs_put_root(pending_snapshot->snap); +- btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); ++ btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); + free_pending: + if (pending_snapshot->anon_dev) + free_anon_bdev(pending_snapshot->anon_dev); +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(str + if (ret && qgroup_num_bytes) + btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + ++ if (!ret) { ++ spin_lock(&rsv->lock); ++ rsv->qgroup_rsv_reserved += qgroup_num_bytes; ++ spin_unlock(&rsv->lock); ++ } + return ret; + } + +-void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, ++void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv) + { +- btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); ++ struct btrfs_fs_info *fs_info = root->fs_info; ++ u64 qgroup_to_release; ++ ++ btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); ++ btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); + } diff --git a/queue-5.9/btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch b/queue-5.9/btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch new file mode 100644 index 00000000000..9bbba89a2b8 --- /dev/null +++ 
b/queue-5.9/btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch @@ -0,0 +1,66 @@ +From b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 24 Jul 2020 14:46:09 +0800 +Subject: btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode + +From: Qu Wenruo + +commit b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c upstream. + +For delayed inode facility, qgroup metadata is reserved for it, and +later freed. + +However we're freeing more bytes than we reserved. +In btrfs_delayed_inode_reserve_metadata(): + + num_bytes = btrfs_calc_metadata_size(fs_info, 1); + ... + ret = btrfs_qgroup_reserve_meta_prealloc(root, + fs_info->nodesize, true); + ... + if (!ret) { + node->bytes_reserved = num_bytes; + +But in btrfs_delayed_inode_release_metadata(): + + if (qgroup_free) + btrfs_qgroup_free_meta_prealloc(node->root, + node->bytes_reserved); + else + btrfs_qgroup_convert_reserved_meta(node->root, + node->bytes_reserved); + +This means, we're always releasing more qgroup metadata rsv than we have +reserved. + +This won't trigger selftest warning, as btrfs qgroup metadata rsv has +extra protection against cases like quota enabled half-way. + +But we still need to fix this problem any way. + +This patch will use the same num_bytes for qgroup metadata rsv so we +could handle it correctly. 
+ +Fixes: f218ea6c4792 ("btrfs: delayed-inode: Remove wrong qgroup meta reservation calls") +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/delayed-inode.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/delayed-inode.c ++++ b/fs/btrfs/delayed-inode.c +@@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_m + */ + if (!src_rsv || (!trans->bytes_reserved && + src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { +- ret = btrfs_qgroup_reserve_meta_prealloc(root, +- fs_info->nodesize, true); ++ ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); + if (ret < 0) + return ret; + ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, diff --git a/queue-5.9/btrfs-reschedule-if-necessary-when-logging-directory-items.patch b/queue-5.9/btrfs-reschedule-if-necessary-when-logging-directory-items.patch new file mode 100644 index 00000000000..cda4553bab2 --- /dev/null +++ b/queue-5.9/btrfs-reschedule-if-necessary-when-logging-directory-items.patch @@ -0,0 +1,111 @@ +From bb56f02f26fe23798edb1b2175707419b28c752a Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 14 Sep 2020 15:27:50 +0100 +Subject: btrfs: reschedule if necessary when logging directory items + +From: Filipe Manana + +commit bb56f02f26fe23798edb1b2175707419b28c752a upstream. + +Logging directories with many entries can take a significant amount of +time, and in some cases monopolize a cpu/core for a long time if the +logging task doesn't happen to block often enough. + +Johannes and Lu Fengqi reported test case generic/041 triggering a soft +lockup when the kernel has CONFIG_SOFTLOCKUP_DETECTOR=y. 
For this test +case we log an inode with 3002 hard links, and because the test removed +one hard link before fsyncing the file, the inode logging causes the +parent directory do be logged as well, which has 6004 directory items to +log (3002 BTRFS_DIR_ITEM_KEY items plus 3002 BTRFS_DIR_INDEX_KEY items), +so it can take a significant amount of time and trigger the soft lockup. + +So just make tree-log.c:log_dir_items() reschedule when necessary, +releasing the current search path before doing so and then resume from +where it was before the reschedule. + +The stack trace produced when the soft lockup happens is the following: + +[10480.277653] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [xfs_io:28172] +[10480.279418] Modules linked in: dm_thin_pool dm_persistent_data (...) +[10480.284915] irq event stamp: 29646366 +[10480.285987] hardirqs last enabled at (29646365): [] __slab_alloc.constprop.0+0x56/0x60 +[10480.288482] hardirqs last disabled at (29646366): [] irqentry_enter+0x1d/0x50 +[10480.290856] softirqs last enabled at (4612): [] __do_softirq+0x323/0x56c +[10480.293615] softirqs last disabled at (4483): [] asm_call_on_stack+0xf/0x20 +[10480.296428] CPU: 2 PID: 28172 Comm: xfs_io Not tainted 5.9.0-rc4-default+ #1248 +[10480.298948] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 +[10480.302455] RIP: 0010:__slab_alloc.constprop.0+0x19/0x60 +[10480.304151] Code: 86 e8 31 75 21 00 66 66 2e 0f 1f 84 00 00 00 (...) 
+[10480.309558] RSP: 0018:ffffadbe09397a58 EFLAGS: 00000282 +[10480.311179] RAX: ffff8a495ab92840 RBX: 0000000000000282 RCX: 0000000000000006 +[10480.313242] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff85249b66 +[10480.315260] RBP: ffff8a497d04b740 R08: 0000000000000001 R09: 0000000000000001 +[10480.317229] R10: ffff8a497d044800 R11: ffff8a495ab93c40 R12: 0000000000000000 +[10480.319169] R13: 0000000000000000 R14: 0000000000000c40 R15: ffffffffc01daf70 +[10480.321104] FS: 00007fa1dc5c0e40(0000) GS:ffff8a497da00000(0000) knlGS:0000000000000000 +[10480.323559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[10480.325235] CR2: 00007fa1dc5befb8 CR3: 0000000004f8a006 CR4: 0000000000170ea0 +[10480.327259] Call Trace: +[10480.328286] ? overwrite_item+0x1f0/0x5a0 [btrfs] +[10480.329784] __kmalloc+0x831/0xa20 +[10480.331009] ? btrfs_get_32+0xb0/0x1d0 [btrfs] +[10480.332464] overwrite_item+0x1f0/0x5a0 [btrfs] +[10480.333948] log_dir_items+0x2ee/0x570 [btrfs] +[10480.335413] log_directory_changes+0x82/0xd0 [btrfs] +[10480.336926] btrfs_log_inode+0xc9b/0xda0 [btrfs] +[10480.338374] ? init_once+0x20/0x20 [btrfs] +[10480.339711] btrfs_log_inode_parent+0x8d3/0xd10 [btrfs] +[10480.341257] ? dget_parent+0x97/0x2e0 +[10480.342480] btrfs_log_dentry_safe+0x3a/0x50 [btrfs] +[10480.343977] btrfs_sync_file+0x24b/0x5e0 [btrfs] +[10480.345381] do_fsync+0x38/0x70 +[10480.346483] __x64_sys_fsync+0x10/0x20 +[10480.347703] do_syscall_64+0x2d/0x70 +[10480.348891] entry_SYSCALL_64_after_hwframe+0x44/0xa9 +[10480.350444] RIP: 0033:0x7fa1dc80970b +[10480.351642] Code: 0f 05 48 3d 00 f0 ff ff 77 45 c3 0f 1f 40 00 48 (...) 
+[10480.356952] RSP: 002b:00007fffb3d081d0 EFLAGS: 00000293 ORIG_RAX: 000000000000004a +[10480.359458] RAX: ffffffffffffffda RBX: 0000562d93d45e40 RCX: 00007fa1dc80970b +[10480.361426] RDX: 0000562d93d44ab0 RSI: 0000562d93d45e60 RDI: 0000000000000003 +[10480.363367] RBP: 0000000000000001 R08: 0000000000000000 R09: 00007fa1dc7b2a40 +[10480.365317] R10: 0000562d93d0e366 R11: 0000000000000293 R12: 0000000000000001 +[10480.367299] R13: 0000562d93d45290 R14: 0000562d93d45e40 R15: 0000562d93d45e60 + +Link: https://lore.kernel.org/linux-btrfs/20180713090216.GC575@fnst.localdomain/ +Reported-by: Johannes Thumshirn +CC: stable@vger.kernel.org # 4.4+ +Tested-by: Johannes Thumshirn +Reviewed-by: Johannes Thumshirn +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3615,6 +3615,7 @@ static noinline int log_dir_items(struct + * search and this search we'll not find the key again and can just + * bail. 
+ */ ++search: + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) + goto done; +@@ -3634,6 +3635,13 @@ static noinline int log_dir_items(struct + + if (min_key.objectid != ino || min_key.type != key_type) + goto done; ++ ++ if (need_resched()) { ++ btrfs_release_path(path); ++ cond_resched(); ++ goto search; ++ } ++ + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + if (ret) { diff --git a/queue-5.9/btrfs-reschedule-when-cloning-lots-of-extents.patch b/queue-5.9/btrfs-reschedule-when-cloning-lots-of-extents.patch new file mode 100644 index 00000000000..ab10cd43633 --- /dev/null +++ b/queue-5.9/btrfs-reschedule-when-cloning-lots-of-extents.patch @@ -0,0 +1,94 @@ +From 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Tue, 22 Sep 2020 17:27:29 +0900 +Subject: btrfs: reschedule when cloning lots of extents + +From: Johannes Thumshirn + +commit 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 upstream. + +We have several occurrences of a soft lockup from fstest's generic/175 +testcase, which look more or less like this one: + + watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [xfs_io:10030] + Kernel panic - not syncing: softlockup: hung tasks + CPU: 0 PID: 10030 Comm: xfs_io Tainted: G L 5.9.0-rc5+ #768 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4-rebuilt.opensuse.org 04/01/2014 + Call Trace: + + dump_stack+0x77/0xa0 + panic+0xfa/0x2cb + watchdog_timer_fn.cold+0x85/0xa5 + ? lockup_detector_update_enable+0x50/0x50 + __hrtimer_run_queues+0x99/0x4c0 + ? 
recalibrate_cpu_khz+0x10/0x10 + hrtimer_run_queues+0x9f/0xb0 + update_process_times+0x28/0x80 + tick_handle_periodic+0x1b/0x60 + __sysvec_apic_timer_interrupt+0x76/0x210 + asm_call_on_stack+0x12/0x20 + + sysvec_apic_timer_interrupt+0x7f/0x90 + asm_sysvec_apic_timer_interrupt+0x12/0x20 + RIP: 0010:btrfs_tree_unlock+0x91/0x1a0 [btrfs] + RSP: 0018:ffffc90007123a58 EFLAGS: 00000282 + RAX: ffff8881cea2fbe0 RBX: ffff8881cea2fbe0 RCX: 0000000000000000 + RDX: ffff8881d23fd200 RSI: ffffffff82045220 RDI: ffff8881cea2fba0 + RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000032 + R10: 0000160000000000 R11: 0000000000001000 R12: 0000000000001000 + R13: ffff8882357fd5b0 R14: ffff88816fa76e70 R15: ffff8881cea2fad0 + ? btrfs_tree_unlock+0x15b/0x1a0 [btrfs] + btrfs_release_path+0x67/0x80 [btrfs] + btrfs_insert_replace_extent+0x177/0x2c0 [btrfs] + btrfs_replace_file_extents+0x472/0x7c0 [btrfs] + btrfs_clone+0x9ba/0xbd0 [btrfs] + btrfs_clone_files.isra.0+0xeb/0x140 [btrfs] + ? file_update_time+0xcd/0x120 + btrfs_remap_file_range+0x322/0x3b0 [btrfs] + do_clone_file_range+0xb7/0x1e0 + vfs_clone_file_range+0x30/0xa0 + ioctl_file_clone+0x8a/0xc0 + do_vfs_ioctl+0x5b2/0x6f0 + __x64_sys_ioctl+0x37/0xa0 + do_syscall_64+0x33/0x40 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x7f87977fc247 + RSP: 002b:00007ffd51a2f6d8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 + RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f87977fc247 + RDX: 00007ffd51a2f710 RSI: 000000004020940d RDI: 0000000000000003 + RBP: 0000000000000004 R08: 00007ffd51a79080 R09: 0000000000000000 + R10: 00005621f11352f2 R11: 0000000000000206 R12: 0000000000000000 + R13: 0000000000000000 R14: 00005621f128b958 R15: 0000000080000000 + Kernel Offset: disabled + ---[ end Kernel panic - not syncing: softlockup: hung tasks ]--- + +All of these lockup reports have the call chain btrfs_clone_files() -> +btrfs_clone() in common. 
btrfs_clone_files() calls btrfs_clone() with +both source and destination extents locked and loops over the source +extent to create the clones. + +Conditionally reschedule in the btrfs_clone() loop, to give some time back +to other processes. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Johannes Thumshirn +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/reflink.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/reflink.c ++++ b/fs/btrfs/reflink.c +@@ -520,6 +520,8 @@ process_slot: + ret = -EINTR; + goto out; + } ++ ++ cond_resched(); + } + ret = 0; + diff --git a/queue-5.9/btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch b/queue-5.9/btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch new file mode 100644 index 00000000000..af4cb8ceb33 --- /dev/null +++ b/queue-5.9/btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch @@ -0,0 +1,273 @@ +From 98272bb77bf4cc20ed1ffca89832d713e70ebf09 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 21 Sep 2020 14:13:29 +0100 +Subject: btrfs: send, orphanize first all conflicting inodes when processing references + +From: Filipe Manana + +commit 98272bb77bf4cc20ed1ffca89832d713e70ebf09 upstream. + +When doing an incremental send it is possible that when processing the new +references for an inode we end up issuing rename or link operations that +have an invalid path, which contains the orphanized name of a directory +before we actually orphanized it, causing the receiver to fail. + +The following reproducer triggers such scenario: + + $ cat reproducer.sh + #!/bin/bash + + mkfs.btrfs -f /dev/sdi >/dev/null + mount /dev/sdi /mnt/sdi + + touch /mnt/sdi/a + touch /mnt/sdi/b + mkdir /mnt/sdi/testdir + # We want "a" to have a lower inode number then "testdir" (257 vs 259). 
+ mv /mnt/sdi/a /mnt/sdi/testdir/a + + # Filesystem looks like: + # + # . (ino 256) + # |----- testdir/ (ino 259) + # | |----- a (ino 257) + # | + # |----- b (ino 258) + + btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1 + btrfs send -f /tmp/snap1.send /mnt/sdi/snap1 + + # Now rename 259 to "testdir_2", then change the name of 257 to + # "testdir" and make it a direct descendant of the root inode (256). + # Also create a new link for inode 257 with the old name of inode 258. + # By swapping the names and location of several inodes and create a + # nasty dependency chain of rename and link operations. + mv /mnt/sdi/testdir/a /mnt/sdi/a2 + touch /mnt/sdi/testdir/a + mv /mnt/sdi/b /mnt/sdi/b2 + ln /mnt/sdi/a2 /mnt/sdi/b + mv /mnt/sdi/testdir /mnt/sdi/testdir_2 + mv /mnt/sdi/a2 /mnt/sdi/testdir + + # Filesystem now looks like: + # + # . (ino 256) + # |----- testdir_2/ (ino 259) + # | |----- a (ino 260) + # | + # |----- testdir (ino 257) + # |----- b (ino 257) + # |----- b2 (ino 258) + + btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2 + btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2 + + mkfs.btrfs -f /dev/sdj >/dev/null + mount /dev/sdj /mnt/sdj + + btrfs receive -f /tmp/snap1.send /mnt/sdj + btrfs receive -f /tmp/snap2.send /mnt/sdj + + umount /mnt/sdi + umount /mnt/sdj + +When running the reproducer, the receive of the incremental send stream +fails: + + $ ./reproducer.sh + Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' + At subvol /mnt/sdi/snap1 + Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' + At subvol /mnt/sdi/snap2 + At subvol snap1 + At snapshot snap2 + ERROR: link b -> o259-6-0/a failed: No such file or directory + +The problem happens because of the following: + +1) Before we start iterating the list of new references for inode 257, + we generate its current path and store it at @valid_path, done at + the very beginning of process_recorded_refs(). 
The generated path + is "o259-6-0/a", containing the orphanized name for inode 259; + +2) Then we iterate over the list of new references, which has the + references "b" and "testdir" in that specific order; + +3) We process reference "b" first, because it is in the list before + reference "testdir". We then issue a link operation to create + the new reference "b" using a target path corresponding to the + content at @valid_path, which corresponds to "o259-6-0/a". + However we haven't yet orphanized inode 259, its name is still + "testdir", and not "o259-6-0". The orphanization of 259 did not + happen yet because we will process the reference named "testdir" + for inode 257 only in the next iteration of the loop that goes + over the list of new references. + +Fix the issue by having a preliminary iteration over all the new references +at process_recorded_refs(). This iteration is responsible only for doing +the orphanization of other inodes that have an old reference that +conflicts with one of the new references of the inode we are currently +processing. The emission of rename and link operations happens now in the +next iteration of the new references. + +A test case for fstests will follow soon. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/send.c | 127 ++++++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 87 insertions(+), 40 deletions(-) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -3880,52 +3880,56 @@ static int process_recorded_refs(struct + goto out; + } + ++ /* ++ * Before doing any rename and link operations, do a first pass on the ++ * new references to orphanize any unprocessed inodes that may have a ++ * reference that conflicts with one of the new references of the current ++ * inode.
This needs to happen first because a new reference may conflict ++ * with the old reference of a parent directory, so we must make sure ++ * that the path used for link and rename commands don't use an ++ * orphanized name when an ancestor was not yet orphanized. ++ * ++ * Example: ++ * ++ * Parent snapshot: ++ * ++ * . (ino 256) ++ * |----- testdir/ (ino 259) ++ * | |----- a (ino 257) ++ * | ++ * |----- b (ino 258) ++ * ++ * Send snapshot: ++ * ++ * . (ino 256) ++ * |----- testdir_2/ (ino 259) ++ * | |----- a (ino 260) ++ * | ++ * |----- testdir (ino 257) ++ * |----- b (ino 257) ++ * |----- b2 (ino 258) ++ * ++ * Processing the new reference for inode 257 with name "b" may happen ++ * before processing the new reference with name "testdir". If so, we ++ * must make sure that by the time we send a link command to create the ++ * hard link "b", inode 259 was already orphanized, since the generated ++ * path in "valid_path" already contains the orphanized name for 259. ++ * We are processing inode 257, so only later when processing 259 we do ++ * the rename operation to change its temporary (orphanized) name to ++ * "testdir_2". ++ */ + list_for_each_entry(cur, &sctx->new_refs, list) { +- /* +- * We may have refs where the parent directory does not exist +- * yet. This happens if the parent directories inum is higher +- * than the current inum. To handle this case, we create the +- * parent directory out of order. But we need to check if this +- * did already happen before due to other refs in the same dir. +- */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; +- if (ret == inode_state_will_create) { +- ret = 0; +- /* +- * First check if any of the current inodes refs did +- * already create the dir. 
+- */ +- list_for_each_entry(cur2, &sctx->new_refs, list) { +- if (cur == cur2) +- break; +- if (cur2->dir == cur->dir) { +- ret = 1; +- break; +- } +- } +- +- /* +- * If that did not happen, check if a previous inode +- * did already create the dir. +- */ +- if (!ret) +- ret = did_create_dir(sctx, cur->dir); +- if (ret < 0) +- goto out; +- if (!ret) { +- ret = send_create_inode(sctx, cur->dir); +- if (ret < 0) +- goto out; +- } +- } ++ if (ret == inode_state_will_create) ++ continue; + + /* +- * Check if this new ref would overwrite the first ref of +- * another unprocessed inode. If yes, orphanize the +- * overwritten inode. If we find an overwritten ref that is +- * not the first ref, simply unlink it. ++ * Check if this new ref would overwrite the first ref of another ++ * unprocessed inode. If yes, orphanize the overwritten inode. ++ * If we find an overwritten ref that is not the first ref, ++ * simply unlink it. + */ + ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, + cur->name, cur->name_len, +@@ -4002,6 +4006,49 @@ static int process_recorded_refs(struct + if (ret < 0) + goto out; + } ++ } ++ ++ } ++ ++ list_for_each_entry(cur, &sctx->new_refs, list) { ++ /* ++ * We may have refs where the parent directory does not exist ++ * yet. This happens if the parent directories inum is higher ++ * than the current inum. To handle this case, we create the ++ * parent directory out of order. But we need to check if this ++ * did already happen before due to other refs in the same dir. ++ */ ++ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); ++ if (ret < 0) ++ goto out; ++ if (ret == inode_state_will_create) { ++ ret = 0; ++ /* ++ * First check if any of the current inodes refs did ++ * already create the dir. 
++ */ ++ list_for_each_entry(cur2, &sctx->new_refs, list) { ++ if (cur == cur2) ++ break; ++ if (cur2->dir == cur->dir) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ /* ++ * If that did not happen, check if a previous inode ++ * did already create the dir. ++ */ ++ if (!ret) ++ ret = did_create_dir(sctx, cur->dir); ++ if (ret < 0) ++ goto out; ++ if (!ret) { ++ ret = send_create_inode(sctx, cur->dir); ++ if (ret < 0) ++ goto out; ++ } + } + + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { diff --git a/queue-5.9/btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch b/queue-5.9/btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch new file mode 100644 index 00000000000..40bb11410ef --- /dev/null +++ b/queue-5.9/btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch @@ -0,0 +1,244 @@ +From 9c2b4e0347067396ceb3ae929d6888c81d610259 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 21 Sep 2020 14:13:30 +0100 +Subject: btrfs: send, recompute reference path after orphanization of a directory + +From: Filipe Manana + +commit 9c2b4e0347067396ceb3ae929d6888c81d610259 upstream. + +During an incremental send, when an inode has multiple new references we +might end up emitting rename operations for orphanizations that have a +source path that is no longer valid due to a previous orphanization of +some directory inode. This causes the receiver to fail since it tries +to rename a path that does not exists. + +Example reproducer: + + $ cat reproducer.sh + #!/bin/bash + + mkfs.btrfs -f /dev/sdi >/dev/null + mount /dev/sdi /mnt/sdi + + touch /mnt/sdi/f1 + touch /mnt/sdi/f2 + mkdir /mnt/sdi/d1 + mkdir /mnt/sdi/d1/d2 + + # Filesystem looks like: + # + # . 
(ino 256) + # |----- f1 (ino 257) + # |----- f2 (ino 258) + # |----- d1/ (ino 259) + # |----- d2/ (ino 260) + + btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1 + btrfs send -f /tmp/snap1.send /mnt/sdi/snap1 + + # Now do a series of changes such that: + # + # *) inode 258 has one new hardlink and the previous name changed + # + # *) both names conflict with the old names of two other inodes: + # + # 1) the new name "d1" conflicts with the old name of inode 259, + # under directory inode 256 (root) + # + # 2) the new name "d2" conflicts with the old name of inode 260 + # under directory inode 259 + # + # *) inodes 259 and 260 now have the old names of inode 258 + # + # *) inode 257 is now located under inode 260 - an inode with a number + # smaller than the inode (258) for which we created a second hard + # link and swapped its names with inodes 259 and 260 + # + ln /mnt/sdi/f2 /mnt/sdi/d1/f2_link + mv /mnt/sdi/f1 /mnt/sdi/d1/d2/f1 + + # Swap d1 and f2. + mv /mnt/sdi/d1 /mnt/sdi/tmp + mv /mnt/sdi/f2 /mnt/sdi/d1 + mv /mnt/sdi/tmp /mnt/sdi/f2 + + # Swap d2 and f2_link + mv /mnt/sdi/f2/d2 /mnt/sdi/tmp + mv /mnt/sdi/f2/f2_link /mnt/sdi/f2/d2 + mv /mnt/sdi/tmp /mnt/sdi/f2/f2_link + + # Filesystem now looks like: + # + # . 
(ino 256) + # |----- d1 (ino 258) + # |----- f2/ (ino 259) + # |----- f2_link/ (ino 260) + # | |----- f1 (ino 257) + # | + # |----- d2 (ino 258) + + btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2 + btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2 + + mkfs.btrfs -f /dev/sdj >/dev/null + mount /dev/sdj /mnt/sdj + + btrfs receive -f /tmp/snap1.send /mnt/sdj + btrfs receive -f /tmp/snap2.send /mnt/sdj + + umount /mnt/sdi + umount /mnt/sdj + +When executed the receive of the incremental stream fails: + + $ ./reproducer.sh + Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' + At subvol /mnt/sdi/snap1 + Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' + At subvol /mnt/sdi/snap2 + At subvol snap1 + At snapshot snap2 + ERROR: rename d1/d2 -> o260-6-0 failed: No such file or directory + +This happens because: + +1) When processing inode 257 we end up computing the name for inode 259 + because it is an ancestor in the send snapshot, and at that point it + still has its old name, "d1", from the parent snapshot because inode + 259 was not yet processed. We then cache that name, which is valid + until we start processing inode 259 (or set the progress to 260 after + processing its references); + +2) Later we start processing inode 258 and collecting all its new + references into the list sctx->new_refs. The first reference in the + list happens to be the reference for name "d1" while the reference for + name "d2" is next (the last element of the list). + We compute the full path "d1/d2" for this second reference and store + it in the reference (its ->full_path member). The path used for the + new parent directory was "d1" and not "f2" because inode 259, the + new parent, was not yet processed; + +3) When we start processing the new references at process_recorded_refs() + we start with the first reference in the list, for the new name "d1". 
+ Because there is a conflicting inode that was not yet processed, which + is directory inode 259, we orphanize it, renaming it from "d1" to + "o259-6-0"; + +4) Then we start processing the new reference for name "d2", and we + realize it conflicts with the reference of inode 260 in the parent + snapshot. So we issue an orphanization operation for inode 260 by + emitting a rename operation with a destination path of "o260-6-0" + and a source path of "d1/d2" - this source path is the value we + stored in the reference earlier at step 2), corresponding to the + ->full_path member of the reference, however that path is no longer + valid due to the orphanization of the directory inode 259 in step 3). + This makes the receiver fail since the path does not exist; it should + have been "o259-6-0/d2". + +Fix this by recomputing the full path of a reference before emitting an +orphanization if we previously orphanized any directory, since that +directory could be a parent in the new path. This is a rare scenario so +keeping it simple and not checking if that previously orphanized directory +is in fact an ancestor of the inode we are trying to orphanize. + +A test case for fstests follows soon. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/send.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 72 insertions(+) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -3813,6 +3813,72 @@ static int update_ref_path(struct send_c + } + + /* ++ * When processing the new references for an inode we may orphanize an existing ++ * directory inode because its old name conflicts with one of the new references ++ * of the current inode.
Later, when processing another new reference of our ++ * inode, we might need to orphanize another inode, but the path we have in the ++ * reference reflects the pre-orphanization name of the directory we previously ++ * orphanized. For example: ++ * ++ * parent snapshot looks like: ++ * ++ * . (ino 256) ++ * |----- f1 (ino 257) ++ * |----- f2 (ino 258) ++ * |----- d1/ (ino 259) ++ * |----- d2/ (ino 260) ++ * ++ * send snapshot looks like: ++ * ++ * . (ino 256) ++ * |----- d1 (ino 258) ++ * |----- f2/ (ino 259) ++ * |----- f2_link/ (ino 260) ++ * | |----- f1 (ino 257) ++ * | ++ * |----- d2 (ino 258) ++ * ++ * When processing inode 257 we compute the name for inode 259 as "d1", and we ++ * cache it in the name cache. Later when we start processing inode 258, when ++ * collecting all its new references we set a full path of "d1/d2" for its new ++ * reference with name "d2". When we start processing the new references we ++ * start by processing the new reference with name "d1", and this results in ++ * orphanizing inode 259, since its old reference causes a conflict. Then we ++ * move on the next new reference, with name "d2", and we find out we must ++ * orphanize inode 260, as its old reference conflicts with ours - but for the ++ * orphanization we use a source path corresponding to the path we stored in the ++ * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the ++ * receiver fail since the path component "d1/" no longer exists, it was renamed ++ * to "o259-6-0/" when processing the previous new reference. So in this case we ++ * must recompute the path in the new reference and use it for the new ++ * orphanization operation. 
++ */ ++static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) ++{ ++ char *name; ++ int ret; ++ ++ name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); ++ if (!name) ++ return -ENOMEM; ++ ++ fs_path_reset(ref->full_path); ++ ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); ++ if (ret < 0) ++ goto out; ++ ++ ret = fs_path_add(ref->full_path, name, ref->name_len); ++ if (ret < 0) ++ goto out; ++ ++ /* Update the reference's base name pointer. */ ++ set_ref_path(ref, ref->full_path); ++out: ++ kfree(name); ++ return ret; ++} ++ ++/* + * This does all the move/link/unlink/rmdir magic. + */ + static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) +@@ -3946,6 +4012,12 @@ static int process_recorded_refs(struct + struct name_cache_entry *nce; + struct waiting_dir_move *wdm; + ++ if (orphanized_dir) { ++ ret = refresh_ref_path(sctx, cur); ++ if (ret < 0) ++ goto out; ++ } ++ + ret = orphanize_inode(sctx, ow_inode, ow_gen, + cur->full_path); + if (ret < 0) diff --git a/queue-5.9/btrfs-skip-devices-without-magic-signature-when-mounting.patch b/queue-5.9/btrfs-skip-devices-without-magic-signature-when-mounting.patch new file mode 100644 index 00000000000..36b93015077 --- /dev/null +++ b/queue-5.9/btrfs-skip-devices-without-magic-signature-when-mounting.patch @@ -0,0 +1,93 @@ +From 96c2e067ed3e3e004580a643c76f58729206b829 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Wed, 30 Sep 2020 21:09:52 +0800 +Subject: btrfs: skip devices without magic signature when mounting + +From: Anand Jain + +commit 96c2e067ed3e3e004580a643c76f58729206b829 upstream. + +Many things can happen after the device is scanned and before the device +is mounted. One such thing is losing the BTRFS_MAGIC on the device. +If it happens we still won't free that device from the memory and cause +the userland confusion. 
+ +For example: As the BTRFS_IOC_DEV_INFO still carries the device path +which does not have the BTRFS_MAGIC, 'btrfs fi show' still lists +device which does not belong to the filesystem anymore: + + $ mkfs.btrfs -fq -draid1 -mraid1 /dev/sda /dev/sdb + $ wipefs -a /dev/sdb + # /dev/sdb does not contain magic signature + $ mount -o degraded /dev/sda /btrfs + $ btrfs fi show -m + Label: none uuid: 470ec6fb-646b-4464-b3cb-df1b26c527bd + Total devices 2 FS bytes used 128.00KiB + devid 1 size 3.00GiB used 571.19MiB path /dev/sda + devid 2 size 3.00GiB used 571.19MiB path /dev/sdb + +We need to distinguish the missing signature and invalid superblock, so +add a specific error code ENODATA for that. This also fixes failure of +fstest btrfs/198. + +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Josef Bacik +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/disk-io.c | 8 ++++++-- + fs/btrfs/volumes.c | 18 ++++++++++++------ + 2 files changed, 18 insertions(+), 8 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3482,8 +3482,12 @@ struct btrfs_super_block *btrfs_read_dev + return ERR_CAST(page); + + super = page_address(page); +- if (btrfs_super_bytenr(super) != bytenr || +- btrfs_super_magic(super) != BTRFS_MAGIC) { ++ if (btrfs_super_magic(super) != BTRFS_MAGIC) { ++ btrfs_release_disk_super(super); ++ return ERR_PTR(-ENODATA); ++ } ++ ++ if (btrfs_super_bytenr(super) != bytenr) { + btrfs_release_disk_super(super); + return ERR_PTR(-EINVAL); + } +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1200,17 +1200,23 @@ static int open_fs_devices(struct btrfs_ + { + struct btrfs_device *device; + struct btrfs_device *latest_dev = NULL; ++ struct btrfs_device *tmp_device; + + flags |= FMODE_EXCL; + +- list_for_each_entry(device, &fs_devices->devices, dev_list) { +- /* Just open everything we can; ignore failures here */ +- if (btrfs_open_one_device(fs_devices, 
device, flags, holder)) +- continue; ++ list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, ++ dev_list) { ++ int ret; + +- if (!latest_dev || +- device->generation > latest_dev->generation) ++ ret = btrfs_open_one_device(fs_devices, device, flags, holder); ++ if (ret == 0 && ++ (!latest_dev || device->generation > latest_dev->generation)) { + latest_dev = device; ++ } else if (ret == -ENODATA) { ++ fs_devices->num_devices--; ++ list_del(&device->dev_list); ++ btrfs_free_device(device); ++ } + } + if (fs_devices->open_devices == 0) + return -EINVAL; diff --git a/queue-5.9/btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch b/queue-5.9/btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch new file mode 100644 index 00000000000..7f4714726e5 --- /dev/null +++ b/queue-5.9/btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch @@ -0,0 +1,188 @@ +From ca10845a56856fff4de3804c85e6424d0f6d0cde Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 1 Sep 2020 08:09:01 -0400 +Subject: btrfs: sysfs: init devices outside of the chunk_mutex + +From: Josef Bacik + +commit ca10845a56856fff4de3804c85e6424d0f6d0cde upstream. + +While running btrfs/061, btrfs/073, btrfs/078, or btrfs/178 we hit the +following lockdep splat: + + ====================================================== + WARNING: possible circular locking dependency detected + 5.9.0-rc3+ #4 Not tainted + ------------------------------------------------------ + kswapd0/100 is trying to acquire lock: + ffff96ecc22ef4a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 + + but task is already holding lock: + ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 + + which lock already depends on the new lock. 
+ + the existing dependency chain (in reverse order) is: + + -> #3 (fs_reclaim){+.+.}-{0:0}: + fs_reclaim_acquire+0x65/0x80 + slab_pre_alloc_hook.constprop.0+0x20/0x200 + kmem_cache_alloc+0x37/0x270 + alloc_inode+0x82/0xb0 + iget_locked+0x10d/0x2c0 + kernfs_get_inode+0x1b/0x130 + kernfs_get_tree+0x136/0x240 + sysfs_get_tree+0x16/0x40 + vfs_get_tree+0x28/0xc0 + path_mount+0x434/0xc00 + __x64_sys_mount+0xe3/0x120 + do_syscall_64+0x33/0x40 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #2 (kernfs_mutex){+.+.}-{3:3}: + __mutex_lock+0x7e/0x7e0 + kernfs_add_one+0x23/0x150 + kernfs_create_link+0x63/0xa0 + sysfs_do_create_link_sd+0x5e/0xd0 + btrfs_sysfs_add_devices_dir+0x81/0x130 + btrfs_init_new_device+0x67f/0x1250 + btrfs_ioctl+0x1ef/0x2e20 + __x64_sys_ioctl+0x83/0xb0 + do_syscall_64+0x33/0x40 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: + __mutex_lock+0x7e/0x7e0 + btrfs_chunk_alloc+0x125/0x3a0 + find_free_extent+0xdf6/0x1210 + btrfs_reserve_extent+0xb3/0x1b0 + btrfs_alloc_tree_block+0xb0/0x310 + alloc_tree_block_no_bg_flush+0x4a/0x60 + __btrfs_cow_block+0x11a/0x530 + btrfs_cow_block+0x104/0x220 + btrfs_search_slot+0x52e/0x9d0 + btrfs_insert_empty_items+0x64/0xb0 + btrfs_insert_delayed_items+0x90/0x4f0 + btrfs_commit_inode_delayed_items+0x93/0x140 + btrfs_log_inode+0x5de/0x2020 + btrfs_log_inode_parent+0x429/0xc90 + btrfs_log_new_name+0x95/0x9b + btrfs_rename2+0xbb9/0x1800 + vfs_rename+0x64f/0x9f0 + do_renameat2+0x320/0x4e0 + __x64_sys_rename+0x1f/0x30 + do_syscall_64+0x33/0x40 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + -> #0 (&delayed_node->mutex){+.+.}-{3:3}: + __lock_acquire+0x119c/0x1fc0 + lock_acquire+0xa7/0x3d0 + __mutex_lock+0x7e/0x7e0 + __btrfs_release_delayed_node.part.0+0x3f/0x330 + btrfs_evict_inode+0x24c/0x500 + evict+0xcf/0x1f0 + dispose_list+0x48/0x70 + prune_icache_sb+0x44/0x50 + super_cache_scan+0x161/0x1e0 + do_shrink_slab+0x178/0x3c0 + shrink_slab+0x17c/0x290 + shrink_node+0x2b2/0x6d0 + 
balance_pgdat+0x30a/0x670 + kswapd+0x213/0x4c0 + kthread+0x138/0x160 + ret_from_fork+0x1f/0x30 + + other info that might help us debug this: + + Chain exists of: + &delayed_node->mutex --> kernfs_mutex --> fs_reclaim + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(fs_reclaim); + lock(kernfs_mutex); + lock(fs_reclaim); + lock(&delayed_node->mutex); + + *** DEADLOCK *** + + 3 locks held by kswapd0/100: + #0: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 + #1: ffffffff8dd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 + #2: ffff96ed2ade30e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 + + stack backtrace: + CPU: 0 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #4 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 + Call Trace: + dump_stack+0x8b/0xb8 + check_noncircular+0x12d/0x150 + __lock_acquire+0x119c/0x1fc0 + lock_acquire+0xa7/0x3d0 + ? __btrfs_release_delayed_node.part.0+0x3f/0x330 + __mutex_lock+0x7e/0x7e0 + ? __btrfs_release_delayed_node.part.0+0x3f/0x330 + ? __btrfs_release_delayed_node.part.0+0x3f/0x330 + ? lock_acquire+0xa7/0x3d0 + ? find_held_lock+0x2b/0x80 + __btrfs_release_delayed_node.part.0+0x3f/0x330 + btrfs_evict_inode+0x24c/0x500 + evict+0xcf/0x1f0 + dispose_list+0x48/0x70 + prune_icache_sb+0x44/0x50 + super_cache_scan+0x161/0x1e0 + do_shrink_slab+0x178/0x3c0 + shrink_slab+0x17c/0x290 + shrink_node+0x2b2/0x6d0 + balance_pgdat+0x30a/0x670 + kswapd+0x213/0x4c0 + ? _raw_spin_unlock_irqrestore+0x41/0x50 + ? add_wait_queue_exclusive+0x70/0x70 + ? balance_pgdat+0x670/0x670 + kthread+0x138/0x160 + ? kthread_create_worker_on_cpu+0x40/0x40 + ret_from_fork+0x1f/0x30 + +This happens because we are holding the chunk_mutex at the time of +adding in a new device. However we only need to hold the +device_list_mutex, as we're going to iterate over the fs_devices +devices. 
Move the sysfs init stuff outside of the chunk_mutex to get +rid of this lockdep splat. + +CC: stable@vger.kernel.org # 4.4.x: f3cd2c58110dad14e: btrfs: sysfs, rename device_link add/remove functions +CC: stable@vger.kernel.org # 4.4.x +Reported-by: David Sterba +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2613,9 +2613,6 @@ int btrfs_init_new_device(struct btrfs_f + btrfs_set_super_num_devices(fs_info->super_copy, + orig_super_num_devices + 1); + +- /* add sysfs device entry */ +- btrfs_sysfs_add_devices_dir(fs_devices, device); +- + /* + * we've got more storage, clear any full flags on the space + * infos +@@ -2623,6 +2620,10 @@ int btrfs_init_new_device(struct btrfs_f + btrfs_clear_space_info_full(fs_info); + + mutex_unlock(&fs_info->chunk_mutex); ++ ++ /* Add sysfs device entry */ ++ btrfs_sysfs_add_devices_dir(fs_devices, device); ++ + mutex_unlock(&fs_devices->device_list_mutex); + + if (seeding_dev) { diff --git a/queue-5.9/btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch b/queue-5.9/btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch new file mode 100644 index 00000000000..e8423f8c781 --- /dev/null +++ b/queue-5.9/btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch @@ -0,0 +1,120 @@ +From 437490fed3b0c9ae21af8f70e0f338d34560842b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 28 Jul 2020 09:42:49 +0800 +Subject: btrfs: tracepoints: output proper root owner for trace_find_free_extent() + +From: Qu Wenruo + +commit 437490fed3b0c9ae21af8f70e0f338d34560842b upstream. 
+ +The current trace event always outputs results like this: + + find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA) + find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA) + find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA) + find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA) + find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA) + find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA) + +It's saying we're allocating data extent for EXTENT tree, which is not +even possible. + +It's because we always use EXTENT tree as the owner for +trace_find_free_extent() without using the @root from +btrfs_reserve_extent(). + +This patch will change the parameter to use proper @root for +trace_find_free_extent(): + +Now it looks much better: + + find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) + find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA) + find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=1(DATA) + find_free_extent: root=5(FS_TREE) len=4096 empty_size=0 flags=1(DATA) + find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA) + find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) + find_free_extent: root=7(CSUM_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) + find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) + find_free_extent: root=1(ROOT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) + +Reported-by: Hans van Kranenburg +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 7 ++++--- + include/trace/events/btrfs.h | 10 ++++++---- + 2 files changed, 10 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3918,11 +3918,12 
@@ static int prepare_allocation(struct btr + * |- Push harder to find free extents + * |- If not found, re-iterate all block groups + */ +-static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ++static noinline int find_free_extent(struct btrfs_root *root, + u64 ram_bytes, u64 num_bytes, u64 empty_size, + u64 hint_byte_orig, struct btrfs_key *ins, + u64 flags, int delalloc) + { ++ struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + int cache_block_group_error = 0; + struct btrfs_block_group *block_group = NULL; +@@ -3954,7 +3955,7 @@ static noinline int find_free_extent(str + ins->objectid = 0; + ins->offset = 0; + +- trace_find_free_extent(fs_info, num_bytes, empty_size, flags); ++ trace_find_free_extent(root, num_bytes, empty_size, flags); + + space_info = btrfs_find_space_info(fs_info, flags); + if (!space_info) { +@@ -4203,7 +4204,7 @@ int btrfs_reserve_extent(struct btrfs_ro + flags = get_alloc_profile_by_root(root, is_data); + again: + WARN_ON(num_bytes < fs_info->sectorsize); +- ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, ++ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, + hint_byte, ins, flags, delalloc); + if (!ret && !is_data) { + btrfs_dec_block_group_reservations(fs_info, ins->objectid); +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -1176,25 +1176,27 @@ DEFINE_EVENT(btrfs__reserved_extent, bt + + TRACE_EVENT(find_free_extent, + +- TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes, ++ TP_PROTO(const struct btrfs_root *root, u64 num_bytes, + u64 empty_size, u64 data), + +- TP_ARGS(fs_info, num_bytes, empty_size, data), ++ TP_ARGS(root, num_bytes, empty_size, data), + + TP_STRUCT__entry_btrfs( ++ __field( u64, root_objectid ) + __field( u64, num_bytes ) + __field( u64, empty_size ) + __field( u64, data ) + ), + +- TP_fast_assign_btrfs(fs_info, ++ TP_fast_assign_btrfs(root->fs_info, ++ __entry->root_objectid = root->root_key.objectid; + 
__entry->num_bytes = num_bytes; + __entry->empty_size = empty_size; + __entry->data = data; + ), + + TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", +- show_root_type(BTRFS_EXTENT_TREE_OBJECTID), ++ show_root_type(__entry->root_objectid), + __entry->num_bytes, __entry->empty_size, __entry->data, + __print_flags((unsigned long)__entry->data, "|", + BTRFS_GROUP_FLAGS)) diff --git a/queue-5.9/btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch b/queue-5.9/btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch new file mode 100644 index 00000000000..cead2cb1500 --- /dev/null +++ b/queue-5.9/btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch @@ -0,0 +1,102 @@ +From 1465af12e254a68706e110846f59cf0f09683184 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 22 Sep 2020 10:37:01 +0800 +Subject: btrfs: tree-checker: fix false alert caused by legacy btrfs root item + +From: Qu Wenruo + +commit 1465af12e254a68706e110846f59cf0f09683184 upstream. + +Commit 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") +introduced btrfs root item size check, however btrfs root item has two +versions, the legacy one which just ends before generation_v2 member, is +smaller than current btrfs root item size. + +This caused btrfs kernel to reject valid but old tree root leaves. + +Fix this problem by also allowing legacy root item, since kernel can +already handle them pretty well and upgrade to newer root item format +when needed. 
+ +Reported-by: Martin Steigerwald +Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") +CC: stable@vger.kernel.org # 5.4+ +Tested-By: Martin Steigerwald +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-checker.c | 17 ++++++++++++----- + include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ + 2 files changed, 26 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -1035,7 +1035,7 @@ static int check_root_item(struct extent + int slot) + { + struct btrfs_fs_info *fs_info = leaf->fs_info; +- struct btrfs_root_item ri; ++ struct btrfs_root_item ri = { 0 }; + const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | + BTRFS_ROOT_SUBVOL_DEAD; + int ret; +@@ -1044,14 +1044,21 @@ static int check_root_item(struct extent + if (ret < 0) + return ret; + +- if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { ++ if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && ++ btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { + generic_err(leaf, slot, +- "invalid root item size, have %u expect %zu", +- btrfs_item_size_nr(leaf, slot), sizeof(ri)); ++ "invalid root item size, have %u expect %zu or %u", ++ btrfs_item_size_nr(leaf, slot), sizeof(ri), ++ btrfs_legacy_root_item_size()); + } + ++ /* ++ * For legacy root item, the members starting at generation_v2 will be ++ * all filled with 0. ++ * And since we allow geneartion_v2 as 0, it will still pass the check. 
++ */ + read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), +- sizeof(ri)); ++ btrfs_item_size_nr(leaf, slot)); + + /* Generation related */ + if (btrfs_root_generation(&ri) > +--- a/include/uapi/linux/btrfs_tree.h ++++ b/include/uapi/linux/btrfs_tree.h +@@ -4,6 +4,11 @@ + + #include + #include ++#ifdef __KERNEL__ ++#include ++#else ++#include ++#endif + + /* + * This header contains the structure definitions and constants used +@@ -645,6 +650,15 @@ struct btrfs_root_item { + } __attribute__ ((__packed__)); + + /* ++ * Btrfs root item used to be smaller than current size. The old format ends ++ * at where member generation_v2 is. ++ */ ++static inline __u32 btrfs_legacy_root_item_size(void) ++{ ++ return offsetof(struct btrfs_root_item, generation_v2); ++} ++ ++/* + * this is used for both forward and backward root refs + */ + struct btrfs_root_ref { diff --git a/queue-5.9/btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch b/queue-5.9/btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch new file mode 100644 index 00000000000..4af3eb3be13 --- /dev/null +++ b/queue-5.9/btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch @@ -0,0 +1,64 @@ +From 85d07fbe09efd1c529ff3e025e2f0d2c6c96a1b7 Mon Sep 17 00:00:00 2001 +From: Daniel Xu +Date: Thu, 8 Oct 2020 18:09:10 -0700 +Subject: btrfs: tree-checker: validate number of chunk stripes and parity + +From: Daniel Xu + +commit 85d07fbe09efd1c529ff3e025e2f0d2c6c96a1b7 upstream. + +If there's no parity and num_stripes < ncopies, a crafted image can +trigger a division by zero in calc_stripe_length(). + +The image was generated through fuzzing. 
+ +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Qu Wenruo +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209587 +Signed-off-by: Daniel Xu +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-checker.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct exten + u64 type; + u64 features; + bool mixed = false; ++ int raid_index; ++ int nparity; ++ int ncopies; + + length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); ++ raid_index = btrfs_bg_flags_to_raid_index(type); ++ ncopies = btrfs_raid_array[raid_index].ncopies; ++ nparity = btrfs_raid_array[raid_index].nparity; + + if (!num_stripes) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes, have %u", num_stripes); + return -EUCLEAN; + } ++ if (num_stripes < ncopies) { ++ chunk_err(leaf, chunk, logical, ++ "invalid chunk num_stripes < ncopies, have %u < %d", ++ num_stripes, ncopies); ++ return -EUCLEAN; ++ } ++ if (nparity && num_stripes == nparity) { ++ chunk_err(leaf, chunk, logical, ++ "invalid chunk num_stripes == nparity, have %u == %d", ++ num_stripes, nparity); ++ return -EUCLEAN; ++ } + if (!IS_ALIGNED(logical, fs_info->sectorsize)) { + chunk_err(leaf, chunk, logical, + "invalid chunk logical, have %llu should aligned to %u", diff --git a/queue-5.9/btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch b/queue-5.9/btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch new file mode 100644 index 00000000000..76428faeb75 --- /dev/null +++ b/queue-5.9/btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch @@ -0,0 +1,37 @@ +From 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 Mon Sep 17 00:00:00 2001 +From: 
Denis Efremov +Date: Mon, 21 Sep 2020 20:03:35 +0300 +Subject: btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send() + +From: Denis Efremov + +commit 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 upstream. + +btrfs_ioctl_send() used open-coded kvzalloc implementation earlier. +The code was accidentally replaced with kzalloc() call [1]. Restore +the original code by using kvzalloc() to allocate sctx->clone_roots. + +[1] https://patchwork.kernel.org/patch/9757891/#20529627 + +Fixes: 818e010bf9d0 ("btrfs: replace opencoded kvzalloc with the helper") +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Denis Efremov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/send.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -7300,7 +7300,7 @@ long btrfs_ioctl_send(struct file *mnt_f + + alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); + +- sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); ++ sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL); + if (!sctx->clone_roots) { + ret = -ENOMEM; + goto out; diff --git a/queue-5.9/pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch b/queue-5.9/pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch new file mode 100644 index 00000000000..bd775875560 --- /dev/null +++ b/queue-5.9/pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch @@ -0,0 +1,84 @@ +From d12544fb2aa9944b180c35914031a8384ab082c1 Mon Sep 17 00:00:00 2001 +From: Xiang Chen +Date: Tue, 22 Sep 2020 21:11:06 +0800 +Subject: PM: runtime: Remove link state checks in rpm_get/put_supplier() + +From: Xiang Chen + +commit d12544fb2aa9944b180c35914031a8384ab082c1 upstream. 
+ +To support runtime PM for hisi SAS driver (the driver is in directory +drivers/scsi/hisi_sas), we add device link between scsi_device->sdev_gendev +(consumer device) and hisi_hba->dev(supplier device) with flags +DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE. + +After runtime suspended consumers and supplier, unload the driver which +causes a hang. + +We found that it called function device_release_driver_internal() to +release the supplier device (hisi_hba->dev), as the device link was +busy, it set the device link state to DL_STATE_SUPPLIER_UNBIND, and +then it called device_release_driver_internal() to release the consumer +device (scsi_device->sdev_gendev). + +Then it would try to call pm_runtime_get_sync() to resume the consumer +device, but because consumer-supplier relation existed, it would try +to resume the supplier first, but as the link state was already +DL_STATE_SUPPLIER_UNBIND, so it skipped resuming the supplier and only +resumed the consumer which hung (it sends IOs to resume scsi_device +while the SAS controller is suspended). + +Simple flow is as follows: + +device_release_driver_internal -> (supplier device) + if device_links_busy -> + device_links_unbind_consumers -> + ... + WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND) + device_release_driver_internal (consumer device) + pm_runtime_get_sync -> (consumer device) + ... + __rpm_callback -> + rpm_get_suppliers -> + if link->status == DL_STATE_SUPPLIER_UNBIND -> skip the action of resuming the supplier + ... + pm_runtime_clean_up_links + ... + +Correct suspend/resume ordering between a supplier device and its consumer +devices (resume the supplier device before resuming consumer devices, and +suspend consumer devices before suspending the supplier device) should be +guaranteed by runtime PM, but the state checks in rpm_get_supplier() and +rpm_put_supplier() break this rule, so remove them. 
+ +Signed-off-by: Xiang Chen +[ rjw: Subject and changelog edits ] +Cc: All applicable +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/base/power/runtime.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/base/power/runtime.c ++++ b/drivers/base/power/runtime.c +@@ -291,8 +291,7 @@ static int rpm_get_suppliers(struct devi + device_links_read_lock_held()) { + int retval; + +- if (!(link->flags & DL_FLAG_PM_RUNTIME) || +- READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) ++ if (!(link->flags & DL_FLAG_PM_RUNTIME)) + continue; + + retval = pm_runtime_get_sync(link->supplier); +@@ -312,8 +311,6 @@ static void rpm_put_suppliers(struct dev + + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, + device_links_read_lock_held()) { +- if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND) +- continue; + + while (refcount_dec_not_one(&link->rpm_active)) + pm_runtime_put(link->supplier); diff --git a/queue-5.9/scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch b/queue-5.9/scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch new file mode 100644 index 00000000000..f0fdcdbb9bd --- /dev/null +++ b/queue-5.9/scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch @@ -0,0 +1,77 @@ +From 2f4843b172c2c0360ee7792ad98025fae7baefde Mon Sep 17 00:00:00 2001 +From: Helge Deller +Date: Thu, 22 Oct 2020 11:00:05 +0200 +Subject: scsi: mptfusion: Fix null pointer dereferences in mptscsih_remove() + +From: Helge Deller + +commit 2f4843b172c2c0360ee7792ad98025fae7baefde upstream. + +The mptscsih_remove() function triggers a kernel oops if the Scsi_Host +pointer (ioc->sh) is NULL, as can be seen in this syslog: + + ioc0: LSI53C1030 B2: Capabilities={Initiator,Target} + Begin: Waiting for root file system ... 
+ scsi host2: error handler thread failed to spawn, error = -4 + mptspi: ioc0: WARNING - Unable to register controller with SCSI subsystem + Backtrace: + [<000000001045b7cc>] mptspi_probe+0x248/0x3d0 [mptspi] + [<0000000040946470>] pci_device_probe+0x1ac/0x2d8 + [<0000000040add668>] really_probe+0x1bc/0x988 + [<0000000040ade704>] driver_probe_device+0x160/0x218 + [<0000000040adee24>] device_driver_attach+0x160/0x188 + [<0000000040adef90>] __driver_attach+0x144/0x320 + [<0000000040ad7c78>] bus_for_each_dev+0xd4/0x158 + [<0000000040adc138>] driver_attach+0x4c/0x80 + [<0000000040adb3ec>] bus_add_driver+0x3e0/0x498 + [<0000000040ae0130>] driver_register+0xf4/0x298 + [<00000000409450c4>] __pci_register_driver+0x78/0xa8 + [<000000000007d248>] mptspi_init+0x18c/0x1c4 [mptspi] + +This patch adds the necessary NULL-pointer checks. Successfully tested on +a HP C8000 parisc workstation with buggy SCSI drives. + +Link: https://lore.kernel.org/r/20201022090005.GA9000@ls3530.fritz.box +Cc: +Signed-off-by: Helge Deller +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/message/fusion/mptscsih.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/drivers/message/fusion/mptscsih.c ++++ b/drivers/message/fusion/mptscsih.c +@@ -1176,8 +1176,10 @@ mptscsih_remove(struct pci_dev *pdev) + MPT_SCSI_HOST *hd; + int sz1; + +- if((hd = shost_priv(host)) == NULL) +- return; ++ if (host == NULL) ++ hd = NULL; ++ else ++ hd = shost_priv(host); + + mptscsih_shutdown(pdev); + +@@ -1193,14 +1195,15 @@ mptscsih_remove(struct pci_dev *pdev) + "Free'd ScsiLookup (%d) memory\n", + ioc->name, sz1)); + +- kfree(hd->info_kbuf); ++ if (hd) ++ kfree(hd->info_kbuf); + + /* NULL the Scsi_Host pointer + */ + ioc->sh = NULL; + +- scsi_host_put(host); +- ++ if (host) ++ scsi_host_put(host); + mpt_detach(pdev); + + } diff --git a/queue-5.9/scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch b/queue-5.9/scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch new file mode 100644 index 00000000000..a8be4b6ef69 --- /dev/null +++ b/queue-5.9/scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch @@ -0,0 +1,49 @@ +From 50457dab670f396557e60c07f086358460876353 Mon Sep 17 00:00:00 2001 +From: Quinn Tran +Date: Tue, 29 Sep 2020 03:21:50 -0700 +Subject: scsi: qla2xxx: Fix crash on session cleanup with unload + +From: Quinn Tran + +commit 50457dab670f396557e60c07f086358460876353 upstream. + +On unload, session cleanup prematurely gave the signal for driver unload +path to advance. + +Link: https://lore.kernel.org/r/20200929102152.32278-6-njavali@marvell.com +Fixes: 726b85487067 ("qla2xxx: Add framework for async fabric discovery") +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Quinn Tran +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/scsi/qla2xxx/qla_target.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/drivers/scsi/qla2xxx/qla_target.c ++++ b/drivers/scsi/qla2xxx/qla_target.c +@@ -1229,14 +1229,15 @@ void qlt_schedule_sess_for_deletion(stru + case DSC_DELETE_PEND: + return; + case DSC_DELETED: +- if (tgt && tgt->tgt_stop && (tgt->sess_count == 0)) +- wake_up_all(&tgt->waitQ); +- if (sess->vha->fcport_count == 0) +- wake_up_all(&sess->vha->fcport_waitQ); +- + if (!sess->plogi_link[QLT_PLOGI_LINK_SAME_WWN] && +- !sess->plogi_link[QLT_PLOGI_LINK_CONFLICT]) ++ !sess->plogi_link[QLT_PLOGI_LINK_CONFLICT]) { ++ if (tgt && tgt->tgt_stop && tgt->sess_count == 0) ++ wake_up_all(&tgt->waitQ); ++ ++ if (sess->vha->fcport_count == 0) ++ wake_up_all(&sess->vha->fcport_waitQ); + return; ++ } + break; + case DSC_UPD_FCPORT: + /* diff --git a/queue-5.9/scsi-qla2xxx-fix-mpi-reset-needed-message.patch b/queue-5.9/scsi-qla2xxx-fix-mpi-reset-needed-message.patch new file mode 100644 index 00000000000..c44ebcc090a --- /dev/null +++ b/queue-5.9/scsi-qla2xxx-fix-mpi-reset-needed-message.patch @@ -0,0 +1,40 @@ +From 7a6cdbd5e87515ebf6231b762ad903c7cff87b9c Mon Sep 17 00:00:00 2001 +From: Arun Easi +Date: Tue, 29 Sep 2020 03:21:48 -0700 +Subject: scsi: qla2xxx: Fix MPI reset needed message + +From: Arun Easi + +commit 7a6cdbd5e87515ebf6231b762ad903c7cff87b9c upstream. + +When printing the message: + + "MPI Heartbeat stop. MPI reset is not needed.." + +..the wrong register was checked leading to always printing that MPI reset +is not needed, even when it is needed. Fix the MPI reset message. + +Link: https://lore.kernel.org/r/20200929102152.32278-4-njavali@marvell.com +Fixes: cbb01c2f2f63 ("scsi: qla2xxx: Fix MPI failure AEN (8200) handling") +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/scsi/qla2xxx/qla_isr.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/scsi/qla2xxx/qla_isr.c ++++ b/drivers/scsi/qla2xxx/qla_isr.c +@@ -767,7 +767,7 @@ qla27xx_handle_8200_aen(scsi_qla_host_t + ql_log(ql_log_warn, vha, 0x02f0, + "MPI Heartbeat stop. MPI reset is%s needed. " + "MB0[%xh] MB1[%xh] MB2[%xh] MB3[%xh]\n", +- mb[0] & BIT_8 ? "" : " not", ++ mb[1] & BIT_8 ? "" : " not", + mb[0], mb[1], mb[2], mb[3]); + + if ((mb[1] & BIT_8) == 0) diff --git a/queue-5.9/scsi-qla2xxx-fix-reset-of-mpi-firmware.patch b/queue-5.9/scsi-qla2xxx-fix-reset-of-mpi-firmware.patch new file mode 100644 index 00000000000..99911553c80 --- /dev/null +++ b/queue-5.9/scsi-qla2xxx-fix-reset-of-mpi-firmware.patch @@ -0,0 +1,171 @@ +From 3e6efab865ac943f4ec43913eb665695737112b0 Mon Sep 17 00:00:00 2001 +From: Arun Easi +Date: Tue, 29 Sep 2020 03:21:49 -0700 +Subject: scsi: qla2xxx: Fix reset of MPI firmware + +From: Arun Easi + +commit 3e6efab865ac943f4ec43913eb665695737112b0 upstream. + +Normally, the MPI firmware is reset when an MPI dump is collected. If an +unsaved MPI dump exists in the driver, though, an alternate mechanism is +used. This mechanism, which was not fully correct, is not recommended and +instead an MPI dump template walk is suggested to perform the MPI reset. + +To allow for the MPI dump template walk, extra space is reserved in the MPI +dump buffer which gets used only when there is already an MPI dump in +place. + +Link: https://lore.kernel.org/r/20200929102152.32278-5-njavali@marvell.com +Fixes: cbb01c2f2f63 ("scsi: qla2xxx: Fix MPI failure AEN (8200) handling") +Cc: stable@vger.kernel.org +Reviewed-by: Himanshu Madhani +Signed-off-by: Arun Easi +Signed-off-by: Nilesh Javali +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/scsi/qla2xxx/qla_attr.c | 10 ++++++-- + drivers/scsi/qla2xxx/qla_gbl.h | 1 + drivers/scsi/qla2xxx/qla_init.c | 2 + + drivers/scsi/qla2xxx/qla_tmpl.c | 49 ++++++++++------------------------------ + 4 files changed, 23 insertions(+), 39 deletions(-) + +--- a/drivers/scsi/qla2xxx/qla_attr.c ++++ b/drivers/scsi/qla2xxx/qla_attr.c +@@ -157,6 +157,14 @@ qla2x00_sysfs_write_fw_dump(struct file + vha->host_no); + } + break; ++ case 10: ++ if (IS_QLA27XX(ha) || IS_QLA28XX(ha)) { ++ ql_log(ql_log_info, vha, 0x70e9, ++ "Issuing MPI firmware dump on host#%ld.\n", ++ vha->host_no); ++ ha->isp_ops->mpi_fw_dump(vha, 0); ++ } ++ break; + } + return count; + } +@@ -744,8 +752,6 @@ qla2x00_sysfs_write_reset(struct file *f + qla83xx_idc_audit(vha, IDC_AUDIT_TIMESTAMP); + qla83xx_idc_unlock(vha, 0); + break; +- } else if (IS_QLA27XX(ha) || IS_QLA28XX(ha)) { +- qla27xx_reset_mpi(vha); + } else { + /* Make sure FC side is not in reset */ + WARN_ON_ONCE(qla2x00_wait_for_hba_online(vha) != +--- a/drivers/scsi/qla2xxx/qla_gbl.h ++++ b/drivers/scsi/qla2xxx/qla_gbl.h +@@ -938,6 +938,5 @@ extern void qla24xx_process_purex_list(s + + /* nvme.c */ + void qla_nvme_unregister_remote_port(struct fc_port *fcport); +-void qla27xx_reset_mpi(scsi_qla_host_t *vha); + void qla_handle_els_plogi_done(scsi_qla_host_t *vha, struct event_arg *ea); + #endif /* _QLA_GBL_H */ +--- a/drivers/scsi/qla2xxx/qla_init.c ++++ b/drivers/scsi/qla2xxx/qla_init.c +@@ -3298,6 +3298,8 @@ qla2x00_alloc_fw_dump(scsi_qla_host_t *v + j, fwdt->dump_size); + dump_size += fwdt->dump_size; + } ++ /* Add space for spare MPI fw dump. 
*/ ++ dump_size += ha->fwdt[1].dump_size; + } else { + req_q_size = req->length * sizeof(request_t); + rsp_q_size = rsp->length * sizeof(response_t); +--- a/drivers/scsi/qla2xxx/qla_tmpl.c ++++ b/drivers/scsi/qla2xxx/qla_tmpl.c +@@ -12,33 +12,6 @@ + #define IOBASE(vha) IOBAR(ISPREG(vha)) + #define INVALID_ENTRY ((struct qla27xx_fwdt_entry *)0xffffffffffffffffUL) + +-/* hardware_lock assumed held. */ +-static void +-qla27xx_write_remote_reg(struct scsi_qla_host *vha, +- u32 addr, u32 data) +-{ +- struct device_reg_24xx __iomem *reg = &vha->hw->iobase->isp24; +- +- ql_dbg(ql_dbg_misc, vha, 0xd300, +- "%s: addr/data = %xh/%xh\n", __func__, addr, data); +- +- wrt_reg_dword(&reg->iobase_addr, 0x40); +- wrt_reg_dword(&reg->iobase_c4, data); +- wrt_reg_dword(&reg->iobase_window, addr); +-} +- +-void +-qla27xx_reset_mpi(scsi_qla_host_t *vha) +-{ +- ql_dbg(ql_dbg_misc + ql_dbg_verbose, vha, 0xd301, +- "Entered %s.\n", __func__); +- +- qla27xx_write_remote_reg(vha, 0x104050, 0x40004); +- qla27xx_write_remote_reg(vha, 0x10405c, 0x4); +- +- vha->hw->stat.num_mpi_reset++; +-} +- + static inline void + qla27xx_insert16(uint16_t value, void *buf, ulong *len) + { +@@ -1028,7 +1001,6 @@ void + qla27xx_mpi_fwdump(scsi_qla_host_t *vha, int hardware_locked) + { + ulong flags = 0; +- bool need_mpi_reset = true; + + #ifndef __CHECKER__ + if (!hardware_locked) +@@ -1036,14 +1008,20 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha, + #endif + if (!vha->hw->mpi_fw_dump) { + ql_log(ql_log_warn, vha, 0x02f3, "-> mpi_fwdump no buffer\n"); +- } else if (vha->hw->mpi_fw_dumped) { +- ql_log(ql_log_warn, vha, 0x02f4, +- "-> MPI firmware already dumped (%p) -- ignoring request\n", +- vha->hw->mpi_fw_dump); + } else { + struct fwdt *fwdt = &vha->hw->fwdt[1]; + ulong len; + void *buf = vha->hw->mpi_fw_dump; ++ bool walk_template_only = false; ++ ++ if (vha->hw->mpi_fw_dumped) { ++ /* Use the spare area for any further dumps. 
*/ ++ buf += fwdt->dump_size; ++ walk_template_only = true; ++ ql_log(ql_log_warn, vha, 0x02f4, ++ "-> MPI firmware already dumped -- dump saving to temporary buffer %p.\n", ++ buf); ++ } + + ql_log(ql_log_warn, vha, 0x02f5, "-> fwdt1 running...\n"); + if (!fwdt->template) { +@@ -1058,9 +1036,10 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha, + ql_log(ql_log_warn, vha, 0x02f7, + "-> fwdt1 fwdump residual=%+ld\n", + fwdt->dump_size - len); +- } else { +- need_mpi_reset = false; + } ++ vha->hw->stat.num_mpi_reset++; ++ if (walk_template_only) ++ goto bailout; + + vha->hw->mpi_fw_dump_len = len; + vha->hw->mpi_fw_dumped = 1; +@@ -1072,8 +1051,6 @@ qla27xx_mpi_fwdump(scsi_qla_host_t *vha, + } + + bailout: +- if (need_mpi_reset) +- qla27xx_reset_mpi(vha); + #ifndef __CHECKER__ + if (!hardware_locked) + spin_unlock_irqrestore(&vha->hw->hardware_lock, flags); diff --git a/queue-5.9/series b/queue-5.9/series index 0d42f422654..171954010d4 100644 --- a/queue-5.9/series +++ b/queue-5.9/series @@ -213,3 +213,25 @@ acpi-cpufreq-honor-_psd-table-setting-on-new-amd-cpus.patch io-wq-assign-numa-node-locality-if-appropriate.patch w1-mxc_w1-fix-timeout-resolution-problem-leading-to-bus-error.patch fs-kernel_read_file-remove-firmware_prealloc_buffer-enum.patch +scsi-mptfusion-fix-null-pointer-dereferences-in-mptscsih_remove.patch +scsi-qla2xxx-fix-mpi-reset-needed-message.patch +scsi-qla2xxx-fix-reset-of-mpi-firmware.patch +scsi-qla2xxx-fix-crash-on-session-cleanup-with-unload.patch +pm-runtime-remove-link-state-checks-in-rpm_get-put_supplier.patch +btrfs-qgroup-fix-wrong-qgroup-metadata-reserve-for-delayed-inode.patch +btrfs-improve-device-scanning-messages.patch +btrfs-qgroup-fix-qgroup-meta-rsv-leak-for-subvolume-operations.patch +btrfs-sysfs-init-devices-outside-of-the-chunk_mutex.patch +btrfs-tracepoints-output-proper-root-owner-for-trace_find_free_extent.patch +btrfs-reschedule-if-necessary-when-logging-directory-items.patch 
+btrfs-send-orphanize-first-all-conflicting-inodes-when-processing-references.patch +btrfs-send-recompute-reference-path-after-orphanization-of-a-directory.patch +btrfs-use-kvzalloc-to-allocate-clone_roots-in-btrfs_ioctl_send.patch +btrfs-tree-checker-fix-false-alert-caused-by-legacy-btrfs-root-item.patch +btrfs-reschedule-when-cloning-lots-of-extents.patch +btrfs-cleanup-cow-block-on-error.patch +btrfs-skip-devices-without-magic-signature-when-mounting.patch +btrfs-tree-checker-validate-number-of-chunk-stripes-and-parity.patch +btrfs-fix-use-after-free-on-readahead-extent-after-failure-to-create-it.patch +btrfs-fix-readahead-hang-and-use-after-free-after-removing-a-device.patch +btrfs-drop-the-path-before-adding-block-group-sysfs-files.patch