From: Greg Kroah-Hartman Date: Thu, 18 Jun 2020 14:55:07 +0000 (+0200) Subject: 5.4-stable patches X-Git-Tag: v4.4.228~54 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2470fa1a3b7025712e94614f7706eba85f2adf2d;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: btrfs-fix-error-handling-when-submitting-direct-i-o-bio.patch btrfs-fix-space_info-bytes_may_use-underflow-after-nocow-buffered-write.patch btrfs-fix-space_info-bytes_may_use-underflow-during-space-cache-writeout.patch btrfs-fix-wrong-file-range-cleanup-after-an-error-filling-dealloc-range.patch btrfs-force-chunk-allocation-if-our-global-rsv-is-larger-than-metadata.patch btrfs-free-alien-device-after-device-add.patch btrfs-include-non-missing-as-a-qualifier-for-the-latest_bdev.patch btrfs-send-emit-file-capabilities-after-chown.patch --- diff --git a/queue-5.4/btrfs-fix-error-handling-when-submitting-direct-i-o-bio.patch b/queue-5.4/btrfs-fix-error-handling-when-submitting-direct-i-o-bio.patch new file mode 100644 index 00000000000..f2e77e28e43 --- /dev/null +++ b/queue-5.4/btrfs-fix-error-handling-when-submitting-direct-i-o-bio.patch @@ -0,0 +1,63 @@ +From 6d3113a193e3385c72240096fe397618ecab6e43 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Thu, 16 Apr 2020 14:46:12 -0700 +Subject: btrfs: fix error handling when submitting direct I/O bio + +From: Omar Sandoval + +commit 6d3113a193e3385c72240096fe397618ecab6e43 upstream. + +In btrfs_submit_direct_hook(), if a direct I/O write doesn't span a RAID +stripe or chunk, we submit orig_bio without cloning it. In this case, we +don't increment pending_bios. Then, if btrfs_submit_dio_bio() fails, we +decrement pending_bios to -1, and we never complete orig_bio. Fix it by +initializing pending_bios to 1 instead of incrementing later. + +Fixing this exposes another bug: we put orig_bio prematurely and then +put it again from end_io. Fix it by not putting orig_bio. 
+ +After this change, pending_bios is really more of a reference count, but +I'll leave that cleanup separate to keep the fix small. + +Fixes: e65e15355429 ("btrfs: fix panic caused by direct IO") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Nikolay Borisov +Reviewed-by: Josef Bacik +Reviewed-by: Johannes Thumshirn +Signed-off-by: Omar Sandoval +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -8534,7 +8534,6 @@ static int btrfs_submit_direct_hook(stru + + /* bio split */ + ASSERT(geom.len <= INT_MAX); +- atomic_inc(&dip->pending_bios); + do { + clone_len = min_t(int, submit_len, geom.len); + +@@ -8584,7 +8583,8 @@ submit: + if (!status) + return 0; + +- bio_put(bio); ++ if (bio != orig_bio) ++ bio_put(bio); + out_err: + dip->errors = 1; + /* +@@ -8625,7 +8625,7 @@ static void btrfs_submit_direct(struct b + bio->bi_private = dip; + dip->orig_bio = bio; + dip->dio_bio = dio_bio; +- atomic_set(&dip->pending_bios, 0); ++ atomic_set(&dip->pending_bios, 1); + io_bio = btrfs_io_bio(bio); + io_bio->logical = file_offset; + diff --git a/queue-5.4/btrfs-fix-space_info-bytes_may_use-underflow-after-nocow-buffered-write.patch b/queue-5.4/btrfs-fix-space_info-bytes_may_use-underflow-after-nocow-buffered-write.patch new file mode 100644 index 00000000000..c1e180639d7 --- /dev/null +++ b/queue-5.4/btrfs-fix-space_info-bytes_may_use-underflow-after-nocow-buffered-write.patch @@ -0,0 +1,197 @@ +From 467dc47ea99c56e966e99d09dae54869850abeeb Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 27 May 2020 11:16:07 +0100 +Subject: btrfs: fix space_info bytes_may_use underflow after nocow buffered write + +From: Filipe Manana + +commit 467dc47ea99c56e966e99d09dae54869850abeeb upstream. 
+ +When doing a buffered write we always try to reserve data space for it, +even when the file has the NOCOW bit set or the write falls into a file +range covered by a prealloc extent. This is done both because it is +expensive to check if we can do a nocow write (checking if an extent is +shared through reflinks or if there's a hole in the range for example), +and because when writeback starts we might actually need to fallback to +COW mode (for example the block group containing the target extents was +turned into RO mode due to a scrub or balance). + +When we are unable to reserve data space we check if we can do a nocow +write, and if we can, we proceed with dirtying the pages and setting up +the range for delalloc. In this case the bytes_may_use counter of the +data space_info object is not incremented, unlike in the case where we +are able to reserve data space (done through btrfs_check_data_free_space() +which calls btrfs_alloc_data_chunk_ondemand()). + +Later when running delalloc we attempt to start writeback in nocow mode +but we might revert back to cow mode, for example because in the meanwhile +a block group was turned into RO mode by a scrub or relocation. The cow +path after successfully allocating an extent ends up calling +btrfs_add_reserved_bytes(), which expects the bytes_may_use counter of +the data space_info object to have been incremented before - but we did +not do it when the buffered write started, since there was not enough +available data space. So btrfs_add_reserved_bytes() ends up decrementing +the bytes_may_use counter anyway, and when the counter's current value +is smaller than the size of the allocated extent we get a stack trace +like the following: + + ------------[ cut here ]------------ + WARNING: CPU: 0 PID: 20138 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs] + Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...) 
+ CPU: 0 PID: 20138 Comm: kworker/u8:15 Not tainted 5.6.0-rc7-btrfs-next-58 #5 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 + Workqueue: writeback wb_workfn (flush-btrfs-1754) + RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs] + Code: ff ff 48 (...) + RSP: 0018:ffffbda18a4b3568 EFLAGS: 00010287 + RAX: 0000000000000000 RBX: ffff9ca076f5d800 RCX: 0000000000000000 + RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9ca068470410 + RBP: fffffffffffff000 R08: 0000000000000001 R09: 0000000000000000 + R10: ffff9ca079d58040 R11: 0000000000000000 R12: ffff9ca068470400 + R13: ffff9ca0408b2000 R14: 0000000000001000 R15: ffff9ca076f5d800 + FS: 0000000000000000(0000) GS:ffff9ca07a600000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00005605dbfe7048 CR3: 0000000138570006 CR4: 00000000003606f0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + find_free_extent+0x4a0/0x16c0 [btrfs] + btrfs_reserve_extent+0x91/0x180 [btrfs] + cow_file_range+0x12d/0x490 [btrfs] + run_delalloc_nocow+0x341/0xa40 [btrfs] + btrfs_run_delalloc_range+0x1ea/0x6d0 [btrfs] + ? find_lock_delalloc_range+0x221/0x250 [btrfs] + writepage_delalloc+0xe8/0x150 [btrfs] + __extent_writepage+0xe8/0x4c0 [btrfs] + extent_write_cache_pages+0x237/0x530 [btrfs] + ? btrfs_wq_submit_bio+0x9f/0xc0 [btrfs] + extent_writepages+0x44/0xa0 [btrfs] + do_writepages+0x23/0x80 + __writeback_single_inode+0x59/0x700 + writeback_sb_inodes+0x267/0x5f0 + __writeback_inodes_wb+0x87/0xe0 + wb_writeback+0x382/0x590 + ? wb_workfn+0x4a2/0x6c0 + wb_workfn+0x4a2/0x6c0 + process_one_work+0x26d/0x6a0 + worker_thread+0x4f/0x3e0 + ? process_one_work+0x6a0/0x6a0 + kthread+0x103/0x140 + ? 
kthread_create_worker_on_cpu+0x70/0x70 + ret_from_fork+0x3a/0x50 + irq event stamp: 0 + hardirqs last enabled at (0): [<0000000000000000>] 0x0 + hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 + softirqs last enabled at (0): [] copy_process+0x74f/0x2020 + softirqs last disabled at (0): [<0000000000000000>] 0x0 + ---[ end trace f9f6ef8ec4cd8ec9 ]--- + +So to fix this, when falling back into cow mode check if space was not +reserved, by testing for the bit EXTENT_NORESERVE in the respective file +range, and if not, increment the bytes_may_use counter for the data +space_info object. Also clear the EXTENT_NORESERVE bit from the range, so +that if the cow path fails it decrements the bytes_may_use counter when +clearing the delalloc range (through the btrfs_clear_delalloc_extent() +callback). + +Fixes: 7ee9e4405f264e ("Btrfs: check if we can nocow if we don't have data space") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 56 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -49,6 +49,7 @@ + #include "qgroup.h" + #include "delalloc-space.h" + #include "block-group.h" ++#include "space-info.h" + + struct btrfs_iget_args { + struct btrfs_key *location; +@@ -1322,6 +1323,56 @@ static noinline int csum_exist_in_range( + return 1; + } + ++static int fallback_to_cow(struct inode *inode, struct page *locked_page, ++ const u64 start, const u64 end, ++ int *page_started, unsigned long *nr_written) ++{ ++ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; ++ u64 range_start = start; ++ u64 count; ++ ++ /* ++ * If EXTENT_NORESERVE is set it means that when the buffered write was ++ * made we had not enough available data space and therefore we did not ++ * reserve data space for it, since we though we could do NOCOW for the ++ * respective 
file range (either there is prealloc extent or the inode ++ * has the NOCOW bit set). ++ * ++ * However when we need to fallback to COW mode (because for example the ++ * block group for the corresponding extent was turned to RO mode by a ++ * scrub or relocation) we need to do the following: ++ * ++ * 1) We increment the bytes_may_use counter of the data space info. ++ * If COW succeeds, it allocates a new data extent and after doing ++ * that it decrements the space info's bytes_may_use counter and ++ * increments its bytes_reserved counter by the same amount (we do ++ * this at btrfs_add_reserved_bytes()). So we need to increment the ++ * bytes_may_use counter to compensate (when space is reserved at ++ * buffered write time, the bytes_may_use counter is incremented); ++ * ++ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so ++ * that if the COW path fails for any reason, it decrements (through ++ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the ++ * data space info, which we incremented in the step above. ++ */ ++ count = count_range_bits(io_tree, &range_start, end, end + 1 - start, ++ EXTENT_NORESERVE, 0); ++ if (count > 0) { ++ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; ++ struct btrfs_space_info *sinfo = fs_info->data_sinfo; ++ ++ spin_lock(&sinfo->lock); ++ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, count); ++ spin_unlock(&sinfo->lock); ++ ++ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, 0, 0, ++ NULL); ++ } ++ ++ return cow_file_range(inode, locked_page, start, end, page_started, ++ nr_written, 1); ++} ++ + /* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. 
+@@ -1569,9 +1620,9 @@ out_check: + * NOCOW, following one which needs to be COW'ed + */ + if (cow_start != (u64)-1) { +- ret = cow_file_range(inode, locked_page, +- cow_start, found_key.offset - 1, +- page_started, nr_written, 1); ++ ret = fallback_to_cow(inode, locked_page, cow_start, ++ found_key.offset - 1, ++ page_started, nr_written); + if (ret) { + if (nocow) + btrfs_dec_nocow_writers(fs_info, +@@ -1660,8 +1711,8 @@ out_check: + + if (cow_start != (u64)-1) { + cur_offset = end; +- ret = cow_file_range(inode, locked_page, cow_start, end, +- page_started, nr_written, 1); ++ ret = fallback_to_cow(inode, locked_page, cow_start, end, ++ page_started, nr_written); + if (ret) + goto error; + } diff --git a/queue-5.4/btrfs-fix-space_info-bytes_may_use-underflow-during-space-cache-writeout.patch b/queue-5.4/btrfs-fix-space_info-bytes_may_use-underflow-during-space-cache-writeout.patch new file mode 100644 index 00000000000..3c6b6b64b38 --- /dev/null +++ b/queue-5.4/btrfs-fix-space_info-bytes_may_use-underflow-during-space-cache-writeout.patch @@ -0,0 +1,146 @@ +From 2166e5edce9ac1edf3b113d6091ef72fcac2d6c4 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 27 May 2020 11:16:19 +0100 +Subject: btrfs: fix space_info bytes_may_use underflow during space cache writeout + +From: Filipe Manana + +commit 2166e5edce9ac1edf3b113d6091ef72fcac2d6c4 upstream. + +We always preallocate a data extent for writing a free space cache, which +causes writeback to always try the nocow path first, since the free space +inode has the prealloc bit set in its flags. + +However if the block group that contains the data extent for the space +cache has been turned to RO mode due to a running scrub or balance for +example, we have to fallback to the cow path. 
In that case once a new data +extent is allocated we end up calling btrfs_add_reserved_bytes(), which +decrements the counter named bytes_may_use from the data space_info object +with the expectation that this counter was previously incremented with the +same amount (the size of the data extent). + +However when we started writeout of the space cache at cache_save_setup(), +we incremented the value of the bytes_may_use counter through a call to +btrfs_check_data_free_space() and then decremented it through a call to +btrfs_prealloc_file_range_trans() immediately after. So when starting the +writeback if we fallback to cow mode we have to increment the counter +bytes_may_use of the data space_info again to compensate for the extent +allocation done by the cow path. + +When this issue happens we are incorrectly decrementing the bytes_may_use +counter and when its current value is smaller than the amount we try to +subtract we end up with the following warning: + + ------------[ cut here ]------------ + WARNING: CPU: 3 PID: 657 at fs/btrfs/space-info.h:115 btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs] + Modules linked in: btrfs blake2b_generic xor raid6_pq libcrc32c (...) + CPU: 3 PID: 657 Comm: kworker/u8:7 Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 + Workqueue: writeback wb_workfn (flush-btrfs-1591) + RIP: 0010:btrfs_add_reserved_bytes+0x3d6/0x4e0 [btrfs] + Code: ff ff 48 (...) 
+ RSP: 0000:ffffa41608f13660 EFLAGS: 00010287 + RAX: 0000000000001000 RBX: ffff9615b93ae400 RCX: 0000000000000000 + RDX: 0000000000000002 RSI: 0000000000000000 RDI: ffff9615b96ab410 + RBP: fffffffffffee000 R08: 0000000000000001 R09: 0000000000000000 + R10: ffff961585e62a40 R11: 0000000000000000 R12: ffff9615b96ab400 + R13: ffff9615a1a2a000 R14: 0000000000012000 R15: ffff9615b93ae400 + FS: 0000000000000000(0000) GS:ffff9615bb200000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000055cbbc2ae178 CR3: 0000000115794006 CR4: 00000000003606e0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + find_free_extent+0x4a0/0x16c0 [btrfs] + btrfs_reserve_extent+0x91/0x180 [btrfs] + cow_file_range+0x12d/0x490 [btrfs] + btrfs_run_delalloc_range+0x9f/0x6d0 [btrfs] + ? find_lock_delalloc_range+0x221/0x250 [btrfs] + writepage_delalloc+0xe8/0x150 [btrfs] + __extent_writepage+0xe8/0x4c0 [btrfs] + extent_write_cache_pages+0x237/0x530 [btrfs] + extent_writepages+0x44/0xa0 [btrfs] + do_writepages+0x23/0x80 + __writeback_single_inode+0x59/0x700 + writeback_sb_inodes+0x267/0x5f0 + __writeback_inodes_wb+0x87/0xe0 + wb_writeback+0x382/0x590 + ? wb_workfn+0x4a2/0x6c0 + wb_workfn+0x4a2/0x6c0 + process_one_work+0x26d/0x6a0 + worker_thread+0x4f/0x3e0 + ? process_one_work+0x6a0/0x6a0 + kthread+0x103/0x140 + ? kthread_create_worker_on_cpu+0x70/0x70 + ret_from_fork+0x3a/0x50 + irq event stamp: 0 + hardirqs last enabled at (0): [<0000000000000000>] 0x0 + hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 + softirqs last enabled at (0): [] copy_process+0x74f/0x2020 + softirqs last disabled at (0): [<0000000000000000>] 0x0 + ---[ end trace bd7c03622e0b0a52 ]--- + ------------[ cut here ]------------ + +So fix this by incrementing the bytes_may_use counter of the data +space_info when we fallback to the cow path. 
If the cow path is successful +the counter is decremented after extent allocation (by +btrfs_add_reserved_bytes()), if it fails it ends up being decremented as +well when clearing the delalloc range (extent_clear_unlock_delalloc()). + +This could be triggered sporadically by the test case btrfs/061 from +fstests. + +Fixes: 82d5902d9c681b ("Btrfs: Support reading/writing on disk free ino cache") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1327,6 +1327,8 @@ static int fallback_to_cow(struct inode + const u64 start, const u64 end, + int *page_started, unsigned long *nr_written) + { ++ const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode)); ++ const u64 range_bytes = end + 1 - start; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 range_start = start; + u64 count; +@@ -1354,19 +1356,27 @@ static int fallback_to_cow(struct inode + * that if the COW path fails for any reason, it decrements (through + * extent_clear_unlock_delalloc()) the bytes_may_use counter of the + * data space info, which we incremented in the step above. ++ * ++ * If we need to fallback to cow and the inode corresponds to a free ++ * space cache inode, we must also increment bytes_may_use of the data ++ * space_info for the same reason. Space caches always get a prealloc ++ * extent for them, however scrub or balance may have set the block ++ * group that contains that extent to RO mode. + */ +- count = count_range_bits(io_tree, &range_start, end, end + 1 - start, ++ count = count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0); +- if (count > 0) { ++ if (count > 0 || is_space_ino) { ++ const u64 bytes = is_space_ino ? 
range_bytes : count; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_space_info *sinfo = fs_info->data_sinfo; + + spin_lock(&sinfo->lock); +- btrfs_space_info_update_bytes_may_use(fs_info, sinfo, count); ++ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); + spin_unlock(&sinfo->lock); + +- clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, 0, 0, +- NULL); ++ if (count > 0) ++ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, ++ 0, 0, NULL); + } + + return cow_file_range(inode, locked_page, start, end, page_started, diff --git a/queue-5.4/btrfs-fix-wrong-file-range-cleanup-after-an-error-filling-dealloc-range.patch b/queue-5.4/btrfs-fix-wrong-file-range-cleanup-after-an-error-filling-dealloc-range.patch new file mode 100644 index 00000000000..a7b0f876d01 --- /dev/null +++ b/queue-5.4/btrfs-fix-wrong-file-range-cleanup-after-an-error-filling-dealloc-range.patch @@ -0,0 +1,39 @@ +From e2c8e92d1140754073ad3799eb6620c76bab2078 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 27 May 2020 11:15:53 +0100 +Subject: btrfs: fix wrong file range cleanup after an error filling dealloc range + +From: Filipe Manana + +commit e2c8e92d1140754073ad3799eb6620c76bab2078 upstream. + +If an error happens while running delalloc in COW mode for a range, we can +end up calling extent_clear_unlock_delalloc() for a range that goes beyond +our range's end offset by 1 byte, which affects 1 extra page. This results +in clearing bits and doing page operations (such as a page unlock) outside +our target range. + +Fix that by calling extent_clear_unlock_delalloc() with an inclusive end +offset, instead of an exclusive end offset, at cow_file_range(). 
+ +Fixes: a315e68f6e8b30 ("Btrfs: fix invalid attempt to free reserved space on failure to cow range") +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1132,7 +1132,7 @@ out_unlock: + */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, +- start + cur_alloc_size, ++ start + cur_alloc_size - 1, + locked_page, + clear_bits, + page_ops); diff --git a/queue-5.4/btrfs-force-chunk-allocation-if-our-global-rsv-is-larger-than-metadata.patch b/queue-5.4/btrfs-force-chunk-allocation-if-our-global-rsv-is-larger-than-metadata.patch new file mode 100644 index 00000000000..59e5414a1ff --- /dev/null +++ b/queue-5.4/btrfs-force-chunk-allocation-if-our-global-rsv-is-larger-than-metadata.patch @@ -0,0 +1,119 @@ +From 9c343784c4328781129bcf9e671645f69fe4b38a Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 13 Mar 2020 15:28:48 -0400 +Subject: btrfs: force chunk allocation if our global rsv is larger than metadata + +From: Josef Bacik + +commit 9c343784c4328781129bcf9e671645f69fe4b38a upstream. + +Nikolay noticed a bunch of test failures with my global rsv steal +patches. At first he thought they were introduced by them, but they've +been failing for a while with 64k nodes. + +The problem is with 64k nodes we have a global reserve that calculates +out to 13MiB on a freshly made file system, which only has 8MiB of +metadata space. Because of changes I previously made we no longer +account for the global reserve in the overcommit logic, which means we +correctly allow overcommit to happen even though we are already +overcommitted. + +However in some corner cases, for example btrfs/170, we will allocate +the entire file system up with data chunks before we have enough space +pressure to allocate a metadata chunk. 
Then once the fs is full we +ENOSPC out because we cannot overcommit and the global reserve is taking +up all of the available space. + +The most ideal way to deal with this is to change our space reservation +stuff to take into account the height of the tree's that we're +modifying, so that our global reserve calculation does not end up so +obscenely large. + +However that is a huge undertaking. Instead fix this by forcing a chunk +allocation if the global reserve is larger than the total metadata +space. This gives us essentially the same behavior that happened +before, we get a chunk allocated and these tests can pass. + +This is meant to be a stop-gap measure until we can tackle the "tree +height only" project. + +Fixes: 0096420adb03 ("btrfs: do not account global reserve in can_overcommit") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Nikolay Borisov +Tested-by: Nikolay Borisov +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/block-rsv.c | 3 +++ + fs/btrfs/transaction.c | 18 ++++++++++++++++++ + 2 files changed, 21 insertions(+) + +--- a/fs/btrfs/block-rsv.c ++++ b/fs/btrfs/block-rsv.c +@@ -5,6 +5,7 @@ + #include "block-rsv.h" + #include "space-info.h" + #include "transaction.h" ++#include "block-group.h" + + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *block_rsv, +@@ -313,6 +314,8 @@ void btrfs_update_global_block_rsv(struc + else + block_rsv->full = 0; + ++ if (block_rsv->size >= sinfo->total_bytes) ++ sinfo->force_alloc = CHUNK_ALLOC_FORCE; + spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); + } +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -21,6 +21,7 @@ + #include "dev-replace.h" + #include "qgroup.h" + #include "block-group.h" ++#include "space-info.h" + + #define BTRFS_ROOT_TRANS_TAG 0 + +@@ -451,6 +452,7 @@ start_transaction(struct btrfs_root *roo + u64 num_bytes = 0; + u64 qgroup_reserved = 0; + bool 
reloc_reserved = false; ++ bool do_chunk_alloc = false; + int ret; + + /* Send isn't supposed to start transactions. */ +@@ -513,6 +515,9 @@ start_transaction(struct btrfs_root *roo + delayed_refs_bytes); + num_bytes -= delayed_refs_bytes; + } ++ ++ if (rsv->space_info->force_alloc) ++ do_chunk_alloc = true; + } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && + !delayed_refs_rsv->full) { + /* +@@ -595,6 +600,19 @@ got_it: + current->journal_info = h; + + /* ++ * If the space_info is marked ALLOC_FORCE then we'll get upgraded to ++ * ALLOC_FORCE the first run through, and then we won't allocate for ++ * anybody else who races in later. We don't care about the return ++ * value here. ++ */ ++ if (do_chunk_alloc && num_bytes) { ++ u64 flags = h->block_rsv->space_info->flags; ++ ++ btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), ++ CHUNK_ALLOC_NO_FORCE); ++ } ++ ++ /* + * btrfs_record_root_in_trans() needs to alloc new extents, and may + * call btrfs_join_transaction() while we're also starting a + * transaction. diff --git a/queue-5.4/btrfs-free-alien-device-after-device-add.patch b/queue-5.4/btrfs-free-alien-device-after-device-add.patch new file mode 100644 index 00000000000..94bbf24f3a6 --- /dev/null +++ b/queue-5.4/btrfs-free-alien-device-after-device-add.patch @@ -0,0 +1,63 @@ +From 7f551d969037cc128eca60688d9c5a300d84e665 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Tue, 5 May 2020 02:58:26 +0800 +Subject: btrfs: free alien device after device add + +From: Anand Jain + +commit 7f551d969037cc128eca60688d9c5a300d84e665 upstream. + +When an old device has new fsid through 'btrfs device add -f ' our +fs_devices list has an alien device in one of the fs_devices lists. + +By having an alien device in fs_devices, we have two issues so far + +1. missing device does not show as missing in the userland + +2. degraded mount will fail + +Both issues are caused by the fact that there's an alien device in the +fs_devices list. 
(Alien means that it does not belong to the filesystem, +identified by fsid, or does not contain btrfs filesystem at all, eg. due +to overwrite). + +A device can be scanned/added through the control device ioctls +SCAN_DEV, DEVICES_READY or by ADD_DEV. + +And device coming through the control device is checked against the all +other devices in the lists, but this was not the case for ADD_DEV. + +This patch fixes both issues above by removing the alien device. + +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -2769,8 +2769,18 @@ int btrfs_init_new_device(struct btrfs_f + ret = btrfs_commit_transaction(trans); + } + +- /* Update ctime/mtime for libblkid */ ++ /* ++ * Now that we have written a new super block to this device, check all ++ * other fs_devices list if device_path alienates any other scanned ++ * device. ++ * We can ignore the return value as it typically returns -EINVAL and ++ * only succeeds if the device was an alien. ++ */ ++ btrfs_forget_devices(device_path); ++ ++ /* Update ctime/mtime for blkid or udev */ + update_dev_time(device_path); ++ + return ret; + + error_sysfs: diff --git a/queue-5.4/btrfs-include-non-missing-as-a-qualifier-for-the-latest_bdev.patch b/queue-5.4/btrfs-include-non-missing-as-a-qualifier-for-the-latest_bdev.patch new file mode 100644 index 00000000000..c54fd51dd8b --- /dev/null +++ b/queue-5.4/btrfs-include-non-missing-as-a-qualifier-for-the-latest_bdev.patch @@ -0,0 +1,77 @@ +From 998a0671961f66e9fad4990ed75f80ba3088c2f1 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Tue, 5 May 2020 02:58:25 +0800 +Subject: btrfs: include non-missing as a qualifier for the latest_bdev + +From: Anand Jain + +commit 998a0671961f66e9fad4990ed75f80ba3088c2f1 upstream. 
+ +btrfs_free_extra_devids() updates fs_devices::latest_bdev to point to +the bdev with greatest device::generation number. For a typical-missing +device the generation number is zero so fs_devices::latest_bdev will +never point to it. + +But if the missing device is due to alienation [1], then +device::generation is not zero and if it is greater or equal to the rest +of device generations in the list, then fs_devices::latest_bdev ends up +pointing to the missing device and reports the error like [2]. + +[1] We maintain devices of a fsid (as in fs_device::fsid) in the +fs_devices::devices list, a device is considered as an alien device +if its fsid does not match with the fs_device::fsid + +Consider a working filesystem with raid1: + + $ mkfs.btrfs -f -d raid1 -m raid1 /dev/sda /dev/sdb + $ mount /dev/sda /mnt-raid1 + $ umount /mnt-raid1 + +While mnt-raid1 was unmounted the user force-adds one of its devices to +another btrfs filesystem: + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt-single + $ btrfs dev add -f /dev/sda /mnt-single + +Now the original mnt-raid1 fails to mount in degraded mode, because +fs_devices::latest_bdev is pointing to the alien device. + + $ mount -o degraded /dev/sdb /mnt-raid1 + +[2] +mount: wrong fs type, bad option, bad superblock on /dev/sdb, + missing codepage or helper program, or other error + + In some cases useful info is found in syslog - try + dmesg | tail or so. + + kernel: BTRFS warning (device sdb): devid 1 uuid 072a0192-675b-4d5a-8640-a5cf2b2c704d is missing + kernel: BTRFS error (device sdb): failed to read devices + kernel: BTRFS error (device sdb): open_ctree failed + +Fix the root cause by checking if the device is not missing before it +can be considered for the fs_devices::latest_bdev. 
+ +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Josef Bacik +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -1223,6 +1223,8 @@ again: + &device->dev_state)) { + if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, + &device->dev_state) && ++ !test_bit(BTRFS_DEV_STATE_MISSING, ++ &device->dev_state) && + (!latest_dev || + device->generation > latest_dev->generation)) { + latest_dev = device; diff --git a/queue-5.4/btrfs-send-emit-file-capabilities-after-chown.patch b/queue-5.4/btrfs-send-emit-file-capabilities-after-chown.patch new file mode 100644 index 00000000000..dfbc5751b97 --- /dev/null +++ b/queue-5.4/btrfs-send-emit-file-capabilities-after-chown.patch @@ -0,0 +1,154 @@ +From 89efda52e6b6930f80f5adda9c3c9edfb1397191 Mon Sep 17 00:00:00 2001 +From: Marcos Paulo de Souza +Date: Sun, 10 May 2020 23:15:07 -0300 +Subject: btrfs: send: emit file capabilities after chown + +From: Marcos Paulo de Souza + +commit 89efda52e6b6930f80f5adda9c3c9edfb1397191 upstream. + +Whenever a chown is executed, all capabilities of the file being touched +are lost. When doing incremental send with a file with capabilities, +there is a situation where the capability can be lost on the receiving +side. 
The sequence of actions below shows the problem: + + $ mount /dev/sda fs1 + $ mount /dev/sdb fs2 + + $ touch fs1/foo.bar + $ setcap cap_sys_nice+ep fs1/foo.bar + $ btrfs subvolume snapshot -r fs1 fs1/snap_init + $ btrfs send fs1/snap_init | btrfs receive fs2 + + $ chgrp adm fs1/foo.bar + $ setcap cap_sys_nice+ep fs1/foo.bar + + $ btrfs subvolume snapshot -r fs1 fs1/snap_complete + $ btrfs subvolume snapshot -r fs1 fs1/snap_incremental + + $ btrfs send fs1/snap_complete | btrfs receive fs2 + $ btrfs send -p fs1/snap_init fs1/snap_incremental | btrfs receive fs2 + +At this point, only a chown was emitted by "btrfs send" since only the +group was changed. This causes the cap_sys_nice capability to be dropped +from fs2/snap_incremental/foo.bar + +To fix that, only emit capabilities after chown is emitted. The current +code first checks for xattrs that are new/changed, emits them, and later +emits the chown. Now, __process_new_xattr skips capabilities, letting +only finish_inode_if_needed emit them, if they exist, for the inode +being processed. + +This behavior was being worked around in "btrfs receive" side by caching +the capability and only applying it after chown. Now, xattrs are only +emitted _after_ chown, making that workaround not needed anymore. 
+ +Link: https://github.com/kdave/btrfs-progs/issues/202 +CC: stable@vger.kernel.org # 4.4+ +Suggested-by: Filipe Manana +Reviewed-by: Filipe Manana +Signed-off-by: Marcos Paulo de Souza +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/send.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 67 insertions(+) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -23,6 +23,7 @@ + #include "btrfs_inode.h" + #include "transaction.h" + #include "compression.h" ++#include "xattr.h" + + /* + * Maximum number of references an extent can have in order for us to attempt to +@@ -4536,6 +4537,10 @@ static int __process_new_xattr(int num, + struct fs_path *p; + struct posix_acl_xattr_header dummy_acl; + ++ /* Capabilities are emitted by finish_inode_if_needed */ ++ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) ++ return 0; ++ + p = fs_path_alloc(); + if (!p) + return -ENOMEM; +@@ -5098,6 +5103,64 @@ static int send_extent_data(struct send_ + return 0; + } + ++/* ++ * Search for a capability xattr related to sctx->cur_ino. If the capability is ++ * found, call send_set_xattr function to emit it. ++ * ++ * Return 0 if there isn't a capability, or when the capability was emitted ++ * successfully, or < 0 if an error occurred. 
++ */ ++static int send_capabilities(struct send_ctx *sctx) ++{ ++ struct fs_path *fspath = NULL; ++ struct btrfs_path *path; ++ struct btrfs_dir_item *di; ++ struct extent_buffer *leaf; ++ unsigned long data_ptr; ++ char *buf = NULL; ++ int buf_len; ++ int ret = 0; ++ ++ path = alloc_path_for_send(); ++ if (!path) ++ return -ENOMEM; ++ ++ di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino, ++ XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0); ++ if (!di) { ++ /* There is no xattr for this inode */ ++ goto out; ++ } else if (IS_ERR(di)) { ++ ret = PTR_ERR(di); ++ goto out; ++ } ++ ++ leaf = path->nodes[0]; ++ buf_len = btrfs_dir_data_len(leaf, di); ++ ++ fspath = fs_path_alloc(); ++ buf = kmalloc(buf_len, GFP_KERNEL); ++ if (!fspath || !buf) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); ++ if (ret < 0) ++ goto out; ++ ++ data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); ++ read_extent_buffer(leaf, buf, data_ptr, buf_len); ++ ++ ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, ++ strlen(XATTR_NAME_CAPS), buf, buf_len); ++out: ++ kfree(buf); ++ fs_path_free(fspath); ++ btrfs_free_path(path); ++ return ret; ++} ++ + static int clone_range(struct send_ctx *sctx, + struct clone_root *clone_root, + const u64 disk_byte, +@@ -6001,6 +6064,10 @@ static int finish_inode_if_needed(struct + goto out; + } + ++ ret = send_capabilities(sctx); ++ if (ret < 0) ++ goto out; ++ + /* + * If other directory inodes depended on our current directory + * inode's move/rename, now do their move/rename operations. 
diff --git a/queue-5.4/series b/queue-5.4/series index 23c9413a59b..6fbe053d37f 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -157,3 +157,11 @@ bpf-fix-running-sk_skb-program-types-with-ktls.patch selftests-bpf-flow_dissector-close-tap-device-fd-aft.patch kasan-stop-tests-being-eliminated-as-dead-code-with-.patch string.h-fix-incompatibility-between-fortify_source-.patch +btrfs-free-alien-device-after-device-add.patch +btrfs-include-non-missing-as-a-qualifier-for-the-latest_bdev.patch +btrfs-send-emit-file-capabilities-after-chown.patch +btrfs-force-chunk-allocation-if-our-global-rsv-is-larger-than-metadata.patch +btrfs-fix-error-handling-when-submitting-direct-i-o-bio.patch +btrfs-fix-wrong-file-range-cleanup-after-an-error-filling-dealloc-range.patch +btrfs-fix-space_info-bytes_may_use-underflow-after-nocow-buffered-write.patch +btrfs-fix-space_info-bytes_may_use-underflow-during-space-cache-writeout.patch