From cb40c4e6fa3f0b6f20441c37338495cc357e4e8c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 29 Jun 2020 13:04:25 +0200 Subject: [PATCH] 5.7-stable patches added patches: btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch btrfs-fix-hang-on-snapshot-creation-after-rwf_nowait-write.patch btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch --- ...fore-locking-the-log_mutex-on-unlink.patch | 148 +++++++++++++ ...unning-balance-and-scrub-in-parallel.patch | 151 ++++++++++++++ ...tion-failure-due-to-concurrent-scrub.patch | 194 ++++++++++++++++++ ...rite-into-prealloc-extent-beyond-eof.patch | 61 ++++++ ...shot-creation-after-rwf_nowait-write.patch | 53 +++++ ...ite-not-failling-when-we-need-to-cow.patch | 86 ++++++++ queue-5.7/series | 6 + 7 files changed, 699 insertions(+) create mode 100644 queue-5.7/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch create mode 100644 queue-5.7/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch create mode 100644 queue-5.7/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch create mode 100644 queue-5.7/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch create mode 100644 queue-5.7/btrfs-fix-hang-on-snapshot-creation-after-rwf_nowait-write.patch create mode 100644 queue-5.7/btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch diff --git a/queue-5.7/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch b/queue-5.7/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch new file mode 100644 index 00000000000..56aac4c97da --- /dev/null +++ 
b/queue-5.7/btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch @@ -0,0 +1,148 @@ +From e7a79811d0db136dc2d336b56d54cf1b774ce972 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 15 Jun 2020 10:38:44 +0100 +Subject: btrfs: check if a log root exists before locking the log_mutex on unlink + +From: Filipe Manana + +commit e7a79811d0db136dc2d336b56d54cf1b774ce972 upstream. + +This brings back an optimization that commit e678934cbe5f02 ("btrfs: +Remove unnecessary check from join_running_log_trans") removed, but in +a different form. So it's almost equivalent to a revert. + +That commit removed an optimization where we avoid locking a root's +log_mutex when there is no log tree created in the current transaction. +The affected code path is triggered through unlink operations. + +That commit was based on the assumption that the optimization was not +necessary because we used to have the following checks when the patch +was authored: + + int btrfs_del_dir_entries_in_log(...) + { + (...) + if (dir->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + (...) + } + + int btrfs_del_inode_ref_in_log(...) + { + (...) + if (inode->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + (...) + } + +However before that patch was merged, another patch was merged first which +replaced those checks because they were buggy. + +That other patch corresponds to commit 803f0f64d17769 ("Btrfs: fix fsync +not persisting dentry deletions due to inode evictions"). The assumption +that if the logged_trans field of an inode had a smaller value then the +current transaction's generation (transid) meant that the inode was not +logged in the current transaction was only correct if the inode was not +evicted and reloaded in the current transaction. 
So the corresponding bug +fix changed those checks and replaced them with the following helper +function: + + static bool inode_logged(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) + { + if (inode->logged_trans == trans->transid) + return true; + + if (inode->last_trans == trans->transid && + test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) + return true; + + return false; + } + +So if we have a subvolume without a log tree in the current transaction +(because we had no fsyncs), every time we unlink an inode we can end up +trying to lock the log_mutex of the root through join_running_log_trans() +twice, once for the inode being unlinked (by btrfs_del_inode_ref_in_log()) +and once for the parent directory (with btrfs_del_dir_entries_in_log()). + +This means if we have several unlink operations happening in parallel for +inodes in the same subvolume, and those inodes and/or their parent +inode were changed in the current transaction, we end up having a lot of +contention on the log_mutex. + +The test robots from intel reported a -30.7% performance regression for +a REAIM test after commit e678934cbe5f02 ("btrfs: Remove unnecessary check +from join_running_log_trans"). + +So just bring back the optimization to join_running_log_trans() where we +check first if a log root exists before trying to lock the log_mutex. This +is done by checking for a bit that is set on the root when a log tree is +created and removed when a log tree is freed (at transaction commit time). + +Commit e678934cbe5f02 ("btrfs: Remove unnecessary check from +join_running_log_trans") was merged in the 5.4 merge window while commit +803f0f64d17769 ("Btrfs: fix fsync not persisting dentry deletions due to +inode evictions") was merged in the 5.3 merge window. But the first +commit was actually authored before the second commit (May 23 2019 vs +June 19 2019). 
+ +Reported-by: kernel test robot +Link: https://lore.kernel.org/lkml/20200611090233.GL12456@shao2-debian/ +Fixes: e678934cbe5f02 ("btrfs: Remove unnecessary check from join_running_log_trans") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 2 ++ + fs/btrfs/tree-log.c | 5 +++++ + 2 files changed, 7 insertions(+) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -988,6 +988,8 @@ enum { + BTRFS_ROOT_DEAD_RELOC_TREE, + /* Mark dead root stored on device whose cleanup needs to be resumed */ + BTRFS_ROOT_DEAD_TREE, ++ /* The root has a log tree. Used only for subvolume roots. */ ++ BTRFS_ROOT_HAS_LOG_TREE, + }; + + /* +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -169,6 +169,7 @@ static int start_log_trans(struct btrfs_ + if (ret) + goto out; + ++ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); + clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); + root->log_start_pid = current->pid; + } +@@ -195,6 +196,9 @@ static int join_running_log_trans(struct + { + int ret = -ENOENT; + ++ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state)) ++ return ret; ++ + mutex_lock(&root->log_mutex); + if (root->log_root) { + ret = 0; +@@ -3312,6 +3316,7 @@ int btrfs_free_log(struct btrfs_trans_ha + if (root->log_root) { + free_log_tree(trans, root->log_root); + root->log_root = NULL; ++ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); + } + return 0; + } diff --git a/queue-5.7/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch b/queue-5.7/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch new file mode 100644 index 00000000000..bb0e774100b --- /dev/null +++ b/queue-5.7/btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch @@ -0,0 +1,151 @@ +From 6bd335b469f945f75474c11e3f577f85409f39c3 Mon Sep 17 00:00:00 2001 +From: 
Filipe Manana +Date: Mon, 8 Jun 2020 13:33:05 +0100 +Subject: btrfs: fix bytes_may_use underflow when running balance and scrub in parallel + +From: Filipe Manana + +commit 6bd335b469f945f75474c11e3f577f85409f39c3 upstream. + +When balance and scrub are running in parallel it is possible to end up +with an underflow of the bytes_may_use counter of the data space_info +object, which triggers a warning like the following: + + [134243.793196] BTRFS info (device sdc): relocating block group 1104150528 flags data + [134243.806891] ------------[ cut here ]------------ + [134243.807561] WARNING: CPU: 1 PID: 26884 at fs/btrfs/space-info.h:125 btrfs_add_reserved_bytes+0x1da/0x280 [btrfs] + [134243.808819] Modules linked in: btrfs blake2b_generic xor (...) + [134243.815779] CPU: 1 PID: 26884 Comm: kworker/u8:8 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 + [134243.816944] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 + [134243.818389] Workqueue: writeback wb_workfn (flush-btrfs-108483) + [134243.819186] RIP: 0010:btrfs_add_reserved_bytes+0x1da/0x280 [btrfs] + [134243.819963] Code: 0b f2 85 (...) 
+ [134243.822271] RSP: 0018:ffffa4160aae7510 EFLAGS: 00010287 + [134243.822929] RAX: 000000000000c000 RBX: ffff96159a8c1000 RCX: 0000000000000000 + [134243.823816] RDX: 0000000000008000 RSI: 0000000000000000 RDI: ffff96158067a810 + [134243.824742] RBP: ffff96158067a800 R08: 0000000000000001 R09: 0000000000000000 + [134243.825636] R10: ffff961501432a40 R11: 0000000000000000 R12: 000000000000c000 + [134243.826532] R13: 0000000000000001 R14: ffffffffffff4000 R15: ffff96158067a810 + [134243.827432] FS: 0000000000000000(0000) GS:ffff9615baa00000(0000) knlGS:0000000000000000 + [134243.828451] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [134243.829184] CR2: 000055bd7e414000 CR3: 00000001077be004 CR4: 00000000003606e0 + [134243.830083] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [134243.830975] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [134243.831867] Call Trace: + [134243.832211] find_free_extent+0x4a0/0x16c0 [btrfs] + [134243.832846] btrfs_reserve_extent+0x91/0x180 [btrfs] + [134243.833487] cow_file_range+0x12d/0x490 [btrfs] + [134243.834080] fallback_to_cow+0x82/0x1b0 [btrfs] + [134243.834689] ? release_extent_buffer+0x121/0x170 [btrfs] + [134243.835370] run_delalloc_nocow+0x33f/0xa30 [btrfs] + [134243.836032] btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs] + [134243.836725] ? find_lock_delalloc_range+0x221/0x250 [btrfs] + [134243.837450] writepage_delalloc+0xe8/0x150 [btrfs] + [134243.838059] __extent_writepage+0xe8/0x4c0 [btrfs] + [134243.838674] extent_write_cache_pages+0x237/0x530 [btrfs] + [134243.839364] extent_writepages+0x44/0xa0 [btrfs] + [134243.839946] do_writepages+0x23/0x80 + [134243.840401] __writeback_single_inode+0x59/0x700 + [134243.841006] writeback_sb_inodes+0x267/0x5f0 + [134243.841548] __writeback_inodes_wb+0x87/0xe0 + [134243.842091] wb_writeback+0x382/0x590 + [134243.842574] ? 
wb_workfn+0x4a2/0x6c0 + [134243.843030] wb_workfn+0x4a2/0x6c0 + [134243.843468] process_one_work+0x26d/0x6a0 + [134243.843978] worker_thread+0x4f/0x3e0 + [134243.844452] ? process_one_work+0x6a0/0x6a0 + [134243.844981] kthread+0x103/0x140 + [134243.845400] ? kthread_create_worker_on_cpu+0x70/0x70 + [134243.846030] ret_from_fork+0x3a/0x50 + [134243.846494] irq event stamp: 0 + [134243.846892] hardirqs last enabled at (0): [<0000000000000000>] 0x0 + [134243.847682] hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 + [134243.848687] softirqs last enabled at (0): [] copy_process+0x74f/0x2020 + [134243.849913] softirqs last disabled at (0): [<0000000000000000>] 0x0 + [134243.850698] ---[ end trace bd7c03622e0b0a96 ]--- + [134243.851335] ------------[ cut here ]------------ + +When relocating a data block group, for each extent allocated in the +block group we preallocate another extent with the same size for the +data relocation inode (we do it at prealloc_file_extent_cluster()). +We reserve space by calling btrfs_check_data_free_space(), which ends +up incrementing the data space_info's bytes_may_use counter, and +then call btrfs_prealloc_file_range() to allocate the extent, which +always decrements the bytes_may_use counter by the same amount. + +The expectation is that writeback of the data relocation inode always +follows a NOCOW path, by writing into the preallocated extents. However, +when starting writeback we might end up falling back into the COW path, +because the block group that contains the preallocated extent was turned +into RO mode by a scrub running in parallel. The COW path then calls the +extent allocator which ends up calling btrfs_add_reserved_bytes(), and +this function decrements the bytes_may_use counter of the data space_info +object by an amount corresponding to the size of the allocated extent, +despite we haven't previously incremented it. 
When the counter currently +has a value smaller than the allocated extent we reset the counter to 0 +and emit a warning, otherwise we just decrement it and slowly mess up +with this counter which is crucial for space reservation, the end result +can be granting reserved space to tasks when there isn't really enough +free space, and having the tasks fail later in critical places where +error handling consists of a transaction abort or hitting a BUG_ON(). + +Fix this by making sure that if we fall back to the COW path for a data +relocation inode, we increment the bytes_may_use counter of the data +space_info object. The COW path will then decrement it at +btrfs_add_reserved_bytes() on success or through its error handling part +by a call to extent_clear_unlock_delalloc() (which ends up calling +btrfs_clear_delalloc_extent() that does the decrement operation) in case +of an error. + +Test case btrfs/061 from fstests could sporadically trigger this. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1361,6 +1361,8 @@ static int fallback_to_cow(struct inode + int *page_started, unsigned long *nr_written) + { + const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); ++ const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid == ++ BTRFS_DATA_RELOC_TREE_OBJECTID); + const u64 range_bytes = end + 1 - start; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 range_start = start; +@@ -1391,18 +1393,23 @@ static int fallback_to_cow(struct inode + * data space info, which we incremented in the step above. 
+ * + * If we need to fallback to cow and the inode corresponds to a free +- * space cache inode, we must also increment bytes_may_use of the data +- * space_info for the same reason. Space caches always get a prealloc ++ * space cache inode or an inode of the data relocation tree, we must ++ * also increment bytes_may_use of the data space_info for the same ++ * reason. Space caches and relocated data extents always get a prealloc + * extent for them, however scrub or balance may have set the block +- * group that contains that extent to RO mode. ++ * group that contains that extent to RO mode and therefore force COW ++ * when starting writeback. + */ + count = count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0); +- if (count > 0 || is_space_ino) { +- const u64 bytes = is_space_ino ? range_bytes : count; ++ if (count > 0 || is_space_ino || is_reloc_ino) { ++ u64 bytes = count; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_space_info *sinfo = fs_info->data_sinfo; + ++ if (is_space_ino || is_reloc_ino) ++ bytes = range_bytes; ++ + spin_lock(&sinfo->lock); + btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + spin_unlock(&sinfo->lock); diff --git a/queue-5.7/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch b/queue-5.7/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch new file mode 100644 index 00000000000..d9d287fdf9d --- /dev/null +++ b/queue-5.7/btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch @@ -0,0 +1,194 @@ +From 432cd2a10f1c10cead91fe706ff5dc52f06d642a Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 8 Jun 2020 13:32:55 +0100 +Subject: btrfs: fix data block group relocation failure due to concurrent scrub + +From: Filipe Manana + +commit 432cd2a10f1c10cead91fe706ff5dc52f06d642a upstream. 
+ +When running relocation of a data block group while scrub is running in +parallel, it is possible that the relocation will fail and abort the +current transaction with an -EINVAL error: + + [134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents + [134243.999871] ------------[ cut here ]------------ + [134244.000741] BTRFS: Transaction aborted (error -22) + [134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs] + [134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...) + [134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 + [134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 + [134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs] + [134244.017151] Code: 48 c7 c7 (...) + [134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286 + [134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000 + [134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001 + [134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001 + [134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08 + [134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000 + [134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000 + [134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0 + [134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [134244.034484] Call Trace: + [134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs] + [134244.035859] do_relocation+0x30b/0x790 [btrfs] + [134244.036681] ? 
do_raw_spin_unlock+0x49/0xc0 + [134244.037460] ? _raw_spin_unlock+0x29/0x40 + [134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs] + [134244.039245] relocate_block_group+0x388/0x770 [btrfs] + [134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs] + [134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs] + [134244.041345] btrfs_balance+0xc06/0x1860 [btrfs] + [134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs] + [134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs] + [134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs] + [134244.049043] ? do_raw_spin_unlock+0x49/0xc0 + [134244.049838] ? _raw_spin_unlock+0x29/0x40 + [134244.050587] ? __handle_mm_fault+0x11b3/0x14b0 + [134244.051417] ? ksys_ioctl+0x92/0xb0 + [134244.052070] ksys_ioctl+0x92/0xb0 + [134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c + [134244.053511] __x64_sys_ioctl+0x16/0x20 + [134244.054206] do_syscall_64+0x5c/0x280 + [134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [134244.055819] RIP: 0033:0x7f29b51c9dd7 + [134244.056491] Code: 00 00 00 (...) 
+ [134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 + [134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7 + [134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003 + [134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000 + [134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a + [134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0 + [134244.067626] irq event stamp: 0 + [134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0 + [134244.069351] hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 + [134244.070909] softirqs last enabled at (0): [] copy_process+0x74f/0x2020 + [134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0 + [134244.073432] ---[ end trace bd7c03622e0b0a99 ]--- + +The -EINVAL error comes from the following chain of function calls: + + __btrfs_cow_block() <-- aborts the transaction + btrfs_reloc_cow_block() + replace_file_extents() + get_new_location() <-- returns -EINVAL + +When relocating a data block group, for each allocated extent of the block +group, we preallocate another extent (at prealloc_file_extent_cluster()), +associated with the data relocation inode, and then dirty all its pages. +These preallocated extents have, and must have, the same size that extents +from the data block group being relocated have. + +Later before we start the relocation stage that updates pointers (bytenr +field of file extent items) to point to the the new extents, we trigger +writeback for the data relocation inode. The expectation is that writeback +will write the pages to the previously preallocated extents, that it +follows the NOCOW path. That is generally the case, however, if a scrub +is running it may have turned the block group that contains those extents +into RO mode, in which case writeback falls back to the COW path. 
+ +However in the COW path instead of allocating exactly one extent with the +expected size, the allocator may end up allocating several smaller extents +due to free space fragmentation - because we tell it at cow_file_range() +that the minimum allocation size can match the filesystem's sector size. +This later breaks the relocation's expectation that an extent associated +to a file extent item in the data relocation inode has the same size as +the respective extent pointed by a file extent item in another tree - in +this case the extent to which the relocation inode points to is smaller, +causing relocation.c:get_new_location() to return -EINVAL. + +For example, if we are relocating a data block group X that has a logical +address of X and the block group has an extent allocated at the logical +address X + 128KiB with a size of 64KiB: + +1) At prealloc_file_extent_cluster() we allocate an extent for the data + relocation inode with a size of 64KiB and associate it to the file + offset 128KiB (X + 128KiB - X) of the data relocation inode. This + preallocated extent was allocated at block group Z; + +2) A scrub running in parallel turns block group Z into RO mode and + starts scrubbing its extents; + +3) Relocation triggers writeback for the data relocation inode; + +4) When running delalloc (btrfs_run_delalloc_range()), we try first the + NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC + set in its flags. However, because block group Z is in RO mode, the + NOCOW path (run_delalloc_nocow()) falls back into the COW path, by + calling cow_file_range(); + +5) At cow_file_range(), in the first iteration of the while loop we call + btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum + allocation size of 4KiB (fs_info->sectorsize). 
Due to free space + fragmentation, btrfs_reserve_extent() ends up allocating two extents + of 32KiB each, each one on a different iteration of that while loop; + +6) Writeback of the data relocation inode completes; + +7) Relocation proceeds and ends up at relocation.c:replace_file_extents(), + with a leaf which has a file extent item that points to the data extent + from block group X, that has a logical address (bytenr) of X + 128KiB + and a size of 64KiB. Then it calls get_new_location(), which does a + lookup in the data relocation tree for a file extent item starting at + offset 128KiB (X + 128KiB - X) and belonging to the data relocation + inode. It finds a corresponding file extent item, however that item + points to an extent that has a size of 32KiB, which doesn't match the + expected size of 64KiB, resulting in -EINVAL being returned from this + function and propagated up to __btrfs_cow_block(), which aborts the + current transaction. + +To fix this make sure that at cow_file_range() when we call the allocator +we pass it a minimum allocation size corresponding to the desired extent size +if the inode belongs to the data relocation tree, otherwise pass it the +filesystem's sector size as the minimum allocation size. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -985,6 +985,7 @@ static noinline int cow_file_range(struc + u64 num_bytes; + unsigned long ram_size; + u64 cur_alloc_size = 0; ++ u64 min_alloc_size; + u64 blocksize = fs_info->sectorsize; + struct btrfs_key ins; + struct extent_map *em; +@@ -1035,10 +1036,26 @@ static noinline int cow_file_range(struc + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + num_bytes - 1, 0); + ++ /* ++ * Relocation relies on the relocated extents to have exactly the same ++ * size as the original extents. Normally writeback for relocation data ++ * extents follows a NOCOW path because relocation preallocates the ++ * extents. However, due to an operation such as scrub turning a block ++ * group to RO mode, it may fallback to COW mode, so we must make sure ++ * an extent allocated during COW has exactly the requested size and can ++ * not be split into smaller extents, otherwise relocation breaks and ++ * fails during the stage where it updates the bytenr of file extent ++ * items. 
++ */ ++ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) ++ min_alloc_size = num_bytes; ++ else ++ min_alloc_size = fs_info->sectorsize; ++ + while (num_bytes > 0) { + cur_alloc_size = num_bytes; + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, +- fs_info->sectorsize, 0, alloc_hint, ++ min_alloc_size, 0, alloc_hint, + &ins, 1, 1); + if (ret < 0) + goto out_unlock; diff --git a/queue-5.7/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch b/queue-5.7/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch new file mode 100644 index 00000000000..cced689babe --- /dev/null +++ b/queue-5.7/btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch @@ -0,0 +1,61 @@ +From 4b1946284dd6641afdb9457101056d9e6ee6204c Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 15 Jun 2020 18:48:58 +0100 +Subject: btrfs: fix failure of RWF_NOWAIT write into prealloc extent beyond eof + +From: Filipe Manana + +commit 4b1946284dd6641afdb9457101056d9e6ee6204c upstream. + +If we attempt to write to prealloc extent located after eof using a +RWF_NOWAIT write, we always fail with -EAGAIN. + +We do actually check if we have an allocated extent for the write at +the start of btrfs_file_write_iter() through a call to check_can_nocow(), +but later when we go into the actual direct IO write path we simply +return -EAGAIN if the write starts at or beyond EOF. + +Trivial to reproduce: + + $ mkfs.btrfs -f /dev/sdb + $ mount /dev/sdb /mnt + + $ touch /mnt/foo + $ chattr +C /mnt/foo + + $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foo + wrote 65536/65536 bytes at offset 0 + 64 KiB, 16 ops; 0.0004 sec (135.575 MiB/sec and 34707.1584 ops/sec) + + $ xfs_io -c "falloc -k 64K 1M" /mnt/foo + + $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe -b 64K 64K 64K" /mnt/foo + pwrite: Resource temporarily unavailable + +On xfs and ext4 the write succeeds, as expected. + +Fix this by removing the wrong check at btrfs_direct_IO(). 
+ +Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -8262,9 +8262,6 @@ static ssize_t btrfs_direct_IO(struct ki + dio_data.overwrite = 1; + inode_unlock(inode); + relock = true; +- } else if (iocb->ki_flags & IOCB_NOWAIT) { +- ret = -EAGAIN; +- goto out; + } + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + offset, count); diff --git a/queue-5.7/btrfs-fix-hang-on-snapshot-creation-after-rwf_nowait-write.patch b/queue-5.7/btrfs-fix-hang-on-snapshot-creation-after-rwf_nowait-write.patch new file mode 100644 index 00000000000..df702da400b --- /dev/null +++ b/queue-5.7/btrfs-fix-hang-on-snapshot-creation-after-rwf_nowait-write.patch @@ -0,0 +1,53 @@ +From f2cb2f39ccc30fa13d3ac078d461031a63960e5b Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 15 Jun 2020 18:46:01 +0100 +Subject: btrfs: fix hang on snapshot creation after RWF_NOWAIT write + +From: Filipe Manana + +commit f2cb2f39ccc30fa13d3ac078d461031a63960e5b upstream. + +If we do a successful RWF_NOWAIT write we end up locking the snapshot lock +of the inode, through a call to check_can_nocow(), but we never unlock it. + +This means the next attempt to create a snapshot on the subvolume will +hang forever. + +Trivial reproducer: + + $ mkfs.btrfs -f /dev/sdb + $ mount /dev/sdb /mnt + + $ touch /mnt/foobar + $ chattr +C /mnt/foobar + $ xfs_io -d -c "pwrite -S 0xab 0 64K" /mnt/foobar + $ xfs_io -d -c "pwrite -N -V 1 -S 0xfe 0 64K" /mnt/foobar + + $ btrfs subvolume snapshot -r /mnt /mnt/snap + --> hangs + +Fix this by unlocking the snapshot lock if check_can_nocow() returned +success. 
+ +Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1922,6 +1922,8 @@ static ssize_t btrfs_file_write_iter(str + inode_unlock(inode); + return -EAGAIN; + } ++ /* check_can_nocow() locks the snapshot lock on success */ ++ btrfs_drew_write_unlock(&root->snapshot_lock); + } + + current->backing_dev_info = inode_to_bdi(inode); diff --git a/queue-5.7/btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch b/queue-5.7/btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch new file mode 100644 index 00000000000..9dce15c4d91 --- /dev/null +++ b/queue-5.7/btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch @@ -0,0 +1,86 @@ +From 260a63395f90f67d6ab89e4266af9e3dc34a77e9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 15 Jun 2020 18:49:13 +0100 +Subject: btrfs: fix RWF_NOWAIT write not failling when we need to cow + +From: Filipe Manana + +commit 260a63395f90f67d6ab89e4266af9e3dc34a77e9 upstream. + +If we attempt to do a RWF_NOWAIT write against a file range for which we +can only do NOCOW for a part of it, due to the existence of holes or +shared extents for example, we proceed with the write as if it were +possible to NOCOW the whole range. 
+ +Example: + + $ mkfs.btrfs -f /dev/sdb + $ mount /dev/sdb /mnt + + $ touch /mnt/bar + $ chattr +C /mnt/bar + + $ xfs_io -d -c "pwrite -S 0xab -b 256K 0 256K" /mnt/bar + wrote 262144/262144 bytes at offset 0 + 256 KiB, 1 ops; 0.0003 sec (694.444 MiB/sec and 2777.7778 ops/sec) + + $ xfs_io -c "fpunch 64K 64K" /mnt/bar + $ sync + + $ xfs_io -d -c "pwrite -N -V 1 -b 128K -S 0xfe 0 128K" /mnt/bar + wrote 131072/131072 bytes at offset 0 + 128 KiB, 1 ops; 0.0007 sec (160.051 MiB/sec and 1280.4097 ops/sec) + +This last write should fail with -EAGAIN since the file range from 64K to +128K is a hole. On xfs it fails, as expected, but on ext4 it currently +succeeds because apparently it is expensive to check if there are extents +allocated for the whole range, but I'll check with the ext4 people. + +Fix the issue by checking if check_can_nocow() returns a number of +NOCOW'able bytes smaller than the requested number of bytes, and if it +does, return -EAGAIN. + +Fixes: edf064e7c6fec3 ("btrfs: nowait aio support") +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1912,18 +1912,29 @@ static ssize_t btrfs_file_write_iter(str + pos = iocb->ki_pos; + count = iov_iter_count(from); + if (iocb->ki_flags & IOCB_NOWAIT) { ++ size_t nocow_bytes = count; ++ + /* + * We will allocate space in case nodatacow is not set, + * so bail + */ + if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | + BTRFS_INODE_PREALLOC)) || +- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) { ++ check_can_nocow(BTRFS_I(inode), pos, &nocow_bytes) <= 0) { + inode_unlock(inode); + return -EAGAIN; + } + /* check_can_nocow() locks the snapshot lock on success */ + btrfs_drew_write_unlock(&root->snapshot_lock); ++ /* ++ * There are holes in the range or 
parts of the range that must ++ * be COWed (shared extents, RO block groups, etc), so just bail ++ * out. ++ */ ++ if (nocow_bytes < count) { ++ inode_unlock(inode); ++ return -EAGAIN; ++ } + } + + current->backing_dev_info = inode_to_bdi(inode); diff --git a/queue-5.7/series b/queue-5.7/series index 4a62588eff1..987268eeb91 100644 --- a/queue-5.7/series +++ b/queue-5.7/series @@ -219,3 +219,9 @@ kvm-vmx-stop-context-switching-msr_ia32_umwait_control.patch x86-cpu-use-pinning-mask-for-cr4-bits-needing-to-be-0.patch x86-cpu-reinitialize-ia32_feat_ctl-msr-on-bsp-during-wakeup.patch x86-asm-64-align-start-of-__clear_user-loop-to-16-bytes.patch +btrfs-fix-bytes_may_use-underflow-when-running-balance-and-scrub-in-parallel.patch +btrfs-fix-data-block-group-relocation-failure-due-to-concurrent-scrub.patch +btrfs-check-if-a-log-root-exists-before-locking-the-log_mutex-on-unlink.patch +btrfs-fix-hang-on-snapshot-creation-after-rwf_nowait-write.patch +btrfs-fix-failure-of-rwf_nowait-write-into-prealloc-extent-beyond-eof.patch +btrfs-fix-rwf_nowait-write-not-failling-when-we-need-to-cow.patch -- 2.47.3