From: Greg Kroah-Hartman Date: Sun, 11 Nov 2018 20:26:24 +0000 (-0800) Subject: 4.19-stable patches X-Git-Tag: v4.19.2~19 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7b20d46e09d01a4064da599e7530f4b98578f0cf;p=thirdparty%2Fkernel%2Fstable-queue.git 4.19-stable patches added patches: btrfs-don-t-attempt-to-trim-devices-that-don-t-support-it.patch btrfs-don-t-clean-dirty-pages-during-buffered-writes.patch btrfs-enhance-btrfs_trim_fs-function-to-handle-error-better.patch btrfs-ensure-btrfs_trim_fs-can-trim-the-whole-filesystem.patch btrfs-fix-error-handling-in-btrfs_dev_replace_start.patch btrfs-fix-error-handling-in-free_log_tree.patch btrfs-fix-warning-when-replaying-log-after-fsync-of-a-tmpfile.patch btrfs-fix-wrong-dentries-after-fsync-of-file-that-got-its-parent-replaced.patch btrfs-handle-owner-mismatch-gracefully-when-walking-up-tree.patch btrfs-iterate-all-devices-during-trim-instead-of-fs_devices-alloc_list.patch btrfs-keep-trim-from-interfering-with-transaction-commits.patch btrfs-locking-add-extra-check-in-btrfs_init_new_buffer-to-avoid-deadlock.patch btrfs-make-sure-we-create-all-new-block-groups.patch btrfs-protect-space-cache-inode-alloc-with-gfp_nofs.patch btrfs-qgroup-avoid-calling-qgroup-functions-if-qgroup-is-not-enabled.patch btrfs-qgroup-dirty-all-qgroups-before-rescan.patch btrfs-release-metadata-before-running-delayed-refs.patch btrfs-reset-max_extent_size-on-clear-in-a-bitmap.patch btrfs-wait-on-caching-when-putting-the-bg-cache.patch --- diff --git a/queue-4.19/btrfs-don-t-attempt-to-trim-devices-that-don-t-support-it.patch b/queue-4.19/btrfs-don-t-attempt-to-trim-devices-that-don-t-support-it.patch new file mode 100644 index 00000000000..dde3234f324 --- /dev/null +++ b/queue-4.19/btrfs-don-t-attempt-to-trim-devices-that-don-t-support-it.patch @@ -0,0 +1,39 @@ +From 0be88e367fd8fbdb45257615d691f4675dda062f Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Thu, 6 Sep 2018 17:18:15 -0400 +Subject: btrfs: don't attempt to trim devices 
that don't support it + +From: Jeff Mahoney + +commit 0be88e367fd8fbdb45257615d691f4675dda062f upstream. + +We check whether any device the file system is using supports discard in +the ioctl call, but then we attempt to trim free extents on every device +regardless of whether discard is supported. Due to the way we mask off +EOPNOTSUPP, we can end up issuing the trim operations on each free range +on devices that don't support it, just wasting time. + +Fixes: 499f377f49f08 ("btrfs: iterate over unused chunk space in FITRIM") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Jeff Mahoney +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10789,6 +10789,10 @@ static int btrfs_trim_free_extents(struc + + *trimmed = 0; + ++ /* Discard not supported = nothing to do. */ ++ if (!blk_queue_discard(bdev_get_queue(device->bdev))) ++ return 0; ++ + /* Not writeable = nothing to do. */ + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + return 0; diff --git a/queue-4.19/btrfs-don-t-clean-dirty-pages-during-buffered-writes.patch b/queue-4.19/btrfs-don-t-clean-dirty-pages-during-buffered-writes.patch new file mode 100644 index 00000000000..a7e5f9a0c43 --- /dev/null +++ b/queue-4.19/btrfs-don-t-clean-dirty-pages-during-buffered-writes.patch @@ -0,0 +1,101 @@ +From 7703bdd8d23e6ef057af3253958a793ec6066b28 Mon Sep 17 00:00:00 2001 +From: Chris Mason +Date: Wed, 20 Jun 2018 07:56:11 -0700 +Subject: Btrfs: don't clean dirty pages during buffered writes + +From: Chris Mason + +commit 7703bdd8d23e6ef057af3253958a793ec6066b28 upstream. 
+ +During buffered writes, we follow this basic series of steps: + +again: + lock all the pages + wait for writeback on all the pages + Take the extent range lock + wait for ordered extents on the whole range + clean all the pages + + if (copy_from_user_in_atomic() hits a fault) { + drop our locks + goto again; + } + + dirty all the pages + release all the locks + +The extra waiting, cleaning and locking are there to make sure we don't +modify pages in flight to the drive, after they've been crc'd. + +If some of the pages in the range were already dirty when the write +began, and we need to goto again, we create a window where a dirty page +has been cleaned and unlocked. It may be reclaimed before we're able to +lock it again, which means we'll read the old contents off the drive and +lose any modifications that had been pending writeback. + +We don't actually need to clean the pages. All of the other locking in +place makes sure we don't start IO on the pages, so we can just leave +them dirty for the duration of the write. 
+ +Fixes: 73d59314e6ed (the original btrfs merge) +CC: stable@vger.kernel.org # v4.4+ +Signed-off-by: Chris Mason +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 29 +++++++++++++++++++++++------ + 1 file changed, 23 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inod + + end_of_last_block = start_pos + num_bytes - 1; + ++ /* ++ * The pages may have already been dirty, clear out old accounting so ++ * we can set things up properly ++ */ ++ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block, ++ EXTENT_DIRTY | EXTENT_DELALLOC | ++ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached); ++ + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (start_pos >= isize && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { +@@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct b + } + if (ordered) + btrfs_put_ordered_extent(ordered); +- clear_extent_bit(&inode->io_tree, start_pos, last_pos, +- EXTENT_DIRTY | EXTENT_DELALLOC | +- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, +- 0, 0, cached_state); ++ + *lockstart = start_pos; + *lockend = last_pos; + ret = 1; + } + ++ /* ++ * It's possible the pages are dirty right now, but we don't want ++ * to clean them yet because copy_from_user may catch a page fault ++ * and we might have to fall back to one page at a time. If that ++ * happens, we'll unlock these pages and we'd have a window where ++ * reclaim could sneak in and drop the once-dirty page on the floor ++ * without writing it. ++ * ++ * We have the pages locked and the extent range locked, so there's ++ * no way someone can start IO on any dirty pages in this range. ++ * ++ * We'll call btrfs_dirty_pages() later on, and that will flip around ++ * delalloc bits and dirty the pages as required. 
++ */ + for (i = 0; i < num_pages; i++) { +- if (clear_page_dirty_for_io(pages[i])) +- account_page_redirty(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } diff --git a/queue-4.19/btrfs-enhance-btrfs_trim_fs-function-to-handle-error-better.patch b/queue-4.19/btrfs-enhance-btrfs_trim_fs-function-to-handle-error-better.patch new file mode 100644 index 00000000000..ca896981bf4 --- /dev/null +++ b/queue-4.19/btrfs-enhance-btrfs_trim_fs-function-to-handle-error-better.patch @@ -0,0 +1,149 @@ +From 93bba24d4b5ad1e5cd8b43f64e66ff9d6355dd20 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 7 Sep 2018 14:16:23 +0800 +Subject: btrfs: Enhance btrfs_trim_fs function to handle error better + +From: Qu Wenruo + +commit 93bba24d4b5ad1e5cd8b43f64e66ff9d6355dd20 upstream. + +Function btrfs_trim_fs() doesn't handle errors in a consistent way. If +error happens when trimming existing block groups, it will skip the +remaining blocks and continue to trim unallocated space for each device. + +The return value will only reflect the final error from device trimming. + +This patch will fix such behavior by: + +1) Recording the last error from block group or device trimming + The return value will also reflect the last error during trimming. + Make developer more aware of the problem. + +2) Continuing trimming if possible + If we failed to trim one block group or device, we could still try + the next block group or device. + +3) Report number of failures during block group and device trimming + It would be less noisy, but still gives user a brief summary of + what's going wrong. + +Such behavior can avoid confusion for cases like failure to trim the +first block group and then only unallocated space is trimmed. 
+ +Reported-by: Chris Murphy +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +[ add bg_ret and dev_ret to the messages ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 49 ++++++++++++++++++++++++++++++++++++++----------- + 1 file changed, 38 insertions(+), 11 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10850,6 +10850,15 @@ static int btrfs_trim_free_extents(struc + return ret; + } + ++/* ++ * Trim the whole filesystem by: ++ * 1) trimming the free space in each block group ++ * 2) trimming the unallocated space on each device ++ * ++ * This will also continue trimming even if a block group or device encounters ++ * an error. The return value will be the last error, or 0 if nothing bad ++ * happens. ++ */ + int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) + { + struct btrfs_block_group_cache *cache = NULL; +@@ -10860,6 +10869,10 @@ int btrfs_trim_fs(struct btrfs_fs_info * + u64 end; + u64 trimmed = 0; + u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); ++ u64 bg_failed = 0; ++ u64 dev_failed = 0; ++ int bg_ret = 0; ++ int dev_ret = 0; + int ret = 0; + + /* +@@ -10870,7 +10883,7 @@ int btrfs_trim_fs(struct btrfs_fs_info * + else + cache = btrfs_lookup_block_group(fs_info, range->start); + +- while (cache) { ++ for (; cache; cache = next_block_group(fs_info, cache)) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); + break; +@@ -10884,13 +10897,15 @@ int btrfs_trim_fs(struct btrfs_fs_info * + if (!block_group_cache_done(cache)) { + ret = cache_block_group(cache, 0); + if (ret) { +- btrfs_put_block_group(cache); +- break; ++ bg_failed++; ++ bg_ret = ret; ++ continue; + } + ret = wait_block_group_cache_done(cache); + if (ret) { +- btrfs_put_block_group(cache); +- break; ++ bg_failed++; ++ bg_ret = ret; ++ continue; + } + } + ret = btrfs_trim_block_group(cache, 
+@@ -10901,28 +10916,40 @@ int btrfs_trim_fs(struct btrfs_fs_info * + + trimmed += group_trimmed; + if (ret) { +- btrfs_put_block_group(cache); +- break; ++ bg_failed++; ++ bg_ret = ret; ++ continue; + } + } +- +- cache = next_block_group(fs_info, cache); + } + ++ if (bg_failed) ++ btrfs_warn(fs_info, ++ "failed to trim %llu block group(s), last error %d", ++ bg_failed, bg_ret); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + devices = &fs_info->fs_devices->alloc_list; + list_for_each_entry(device, devices, dev_alloc_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); +- if (ret) ++ if (ret) { ++ dev_failed++; ++ dev_ret = ret; + break; ++ } + + trimmed += group_trimmed; + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + ++ if (dev_failed) ++ btrfs_warn(fs_info, ++ "failed to trim %llu device(s), last error %d", ++ dev_failed, dev_ret); + range->len = trimmed; +- return ret; ++ if (bg_ret) ++ return bg_ret; ++ return dev_ret; + } + + /* diff --git a/queue-4.19/btrfs-ensure-btrfs_trim_fs-can-trim-the-whole-filesystem.patch b/queue-4.19/btrfs-ensure-btrfs_trim_fs-can-trim-the-whole-filesystem.patch new file mode 100644 index 00000000000..62ffd878ba7 --- /dev/null +++ b/queue-4.19/btrfs-ensure-btrfs_trim_fs-can-trim-the-whole-filesystem.patch @@ -0,0 +1,100 @@ +From 6ba9fc8e628becf0e3ec94083450d089b0dec5f5 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 7 Sep 2018 14:16:24 +0800 +Subject: btrfs: Ensure btrfs_trim_fs can trim the whole filesystem + +From: Qu Wenruo + +commit 6ba9fc8e628becf0e3ec94083450d089b0dec5f5 upstream. + +[BUG] +fstrim on some btrfs only trims the unallocated space, not trimming any +space in existing block groups. + +[CAUSE] +Before fstrim_range passed to btrfs_trim_fs(), it gets truncated to +range [0, super->total_bytes). So later btrfs_trim_fs() will only be +able to trim block groups in range [0, super->total_bytes). 
+ +While for btrfs, any bytenr aligned to sectorsize is valid, since btrfs +uses its logical address space, there is nothing limiting the location +where we put block groups. + +For a filesystem with frequent balance, it's quite easy to relocate all +block groups and bytenr of block groups will start beyond +super->total_bytes. + +In that case, btrfs will not trim existing block groups. + +[FIX] +Just remove the truncation in btrfs_ioctl_fitrim(), so btrfs_trim_fs() +can get the unmodified range, which is normally set to [0, U64_MAX]. + +Reported-by: Chris Murphy +Fixes: f4c697e6406d ("btrfs: return EINVAL if start > total_bytes in fitrim ioctl") +CC: stable@vger.kernel.org # v4.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 10 +--------- + fs/btrfs/ioctl.c | 11 +++++++---- + 2 files changed, 8 insertions(+), 13 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10868,21 +10868,13 @@ int btrfs_trim_fs(struct btrfs_fs_info * + u64 start; + u64 end; + u64 trimmed = 0; +- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + u64 bg_failed = 0; + u64 dev_failed = 0; + int bg_ret = 0; + int dev_ret = 0; + int ret = 0; + +- /* +- * try to trim all FS space, our block group may start from non-zero. 
+- */ +- if (range->len == total_bytes) +- cache = btrfs_lookup_first_block_group(fs_info, range->start); +- else +- cache = btrfs_lookup_block_group(fs_info, range->start); +- ++ cache = btrfs_lookup_first_block_group(fs_info, range->start); + for (; cache; cache = next_block_group(fs_info, cache)) { + if (cache->key.objectid >= (range->start + range->len)) { + btrfs_put_block_group(cache); +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(s + struct fstrim_range range; + u64 minlen = ULLONG_MAX; + u64 num_devices = 0; +- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy); + int ret; + + if (!capable(CAP_SYS_ADMIN)) +@@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(s + return -EOPNOTSUPP; + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; +- if (range.start > total_bytes || +- range.len < fs_info->sb->s_blocksize) ++ ++ /* ++ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of ++ * block group is in the logical address space, which can be any ++ * sectorsize aligned bytenr in the range [0, U64_MAX]. ++ */ ++ if (range.len < fs_info->sb->s_blocksize) + return -EINVAL; + +- range.len = min(range.len, total_bytes - range.start); + range.minlen = max(range.minlen, minlen); + ret = btrfs_trim_fs(fs_info, &range); + if (ret < 0) diff --git a/queue-4.19/btrfs-fix-error-handling-in-btrfs_dev_replace_start.patch b/queue-4.19/btrfs-fix-error-handling-in-btrfs_dev_replace_start.patch new file mode 100644 index 00000000000..4fc56561ce8 --- /dev/null +++ b/queue-4.19/btrfs-fix-error-handling-in-btrfs_dev_replace_start.patch @@ -0,0 +1,62 @@ +From 5c06147128fbbdf7a84232c5f0d808f53153defe Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Thu, 6 Sep 2018 15:52:17 -0400 +Subject: btrfs: fix error handling in btrfs_dev_replace_start + +From: Jeff Mahoney + +commit 5c06147128fbbdf7a84232c5f0d808f53153defe upstream. 
+ +When we fail to start a transaction in btrfs_dev_replace_start, we leave +dev_replace->replace_state set to STARTED but clear ->srcdev and +->tgtdev. Later, that can result in an Oops in +btrfs_dev_replace_progress when having state set to STARTED or SUSPENDED +implies that ->srcdev is valid. + +Also fix error handling when the state is already STARTED or SUSPENDED +while starting. That, too, will clear ->srcdev and ->tgtdev even though +it doesn't own them. This should be an impossible case to hit since we +should be protected by the BTRFS_FS_EXCL_OP bit being set. Let's add an +ASSERT there while we're at it. + +Fixes: e93c89c1aaaaa (Btrfs: add new sources for device replace code) +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Jeff Mahoney +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/dev-replace.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/dev-replace.c ++++ b/fs/btrfs/dev-replace.c +@@ -440,6 +440,7 @@ int btrfs_dev_replace_start(struct btrfs + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: ++ ASSERT(0); + ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; + goto leave; + } +@@ -482,6 +483,10 @@ int btrfs_dev_replace_start(struct btrfs + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_dev_replace_write_lock(dev_replace); ++ dev_replace->replace_state = ++ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; ++ dev_replace->srcdev = NULL; ++ dev_replace->tgtdev = NULL; + goto leave; + } + +@@ -503,8 +508,6 @@ int btrfs_dev_replace_start(struct btrfs + return ret; + + leave: +- dev_replace->srcdev = NULL; +- dev_replace->tgtdev = NULL; + btrfs_dev_replace_write_unlock(dev_replace); + btrfs_destroy_dev_replace_tgtdev(tgt_device); + return ret; diff --git a/queue-4.19/btrfs-fix-error-handling-in-free_log_tree.patch b/queue-4.19/btrfs-fix-error-handling-in-free_log_tree.patch new file mode 100644 index 
00000000000..73c0ee5fa4a --- /dev/null +++ b/queue-4.19/btrfs-fix-error-handling-in-free_log_tree.patch @@ -0,0 +1,90 @@ +From 374b0e2d6ba5da7fd1cadb3247731ff27d011f6f Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Thu, 6 Sep 2018 16:59:33 -0400 +Subject: btrfs: fix error handling in free_log_tree + +From: Jeff Mahoney + +commit 374b0e2d6ba5da7fd1cadb3247731ff27d011f6f upstream. + +When we hit an I/O error in free_log_tree->walk_log_tree during file system +shutdown we can crash due to there not being a valid transaction handle. + +Use btrfs_handle_fs_error when there's no transaction handle to use. + + BUG: unable to handle kernel NULL pointer dereference at 0000000000000060 + IP: free_log_tree+0xd2/0x140 [btrfs] + PGD 0 P4D 0 + Oops: 0000 [#1] SMP DEBUG_PAGEALLOC PTI + Modules linked in: + CPU: 2 PID: 23544 Comm: umount Tainted: G W 4.12.14-kvmsmall #9 SLE15 (unreleased) + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 + task: ffff96bfd3478880 task.stack: ffffa7cf40d78000 + RIP: 0010:free_log_tree+0xd2/0x140 [btrfs] + RSP: 0018:ffffa7cf40d7bd10 EFLAGS: 00010282 + RAX: 00000000fffffffb RBX: 00000000fffffffb RCX: 0000000000000002 + RDX: 0000000000000000 RSI: ffff96c02f07d4c8 RDI: 0000000000000282 + RBP: ffff96c013cf1000 R08: ffff96c02f07d4c8 R09: ffff96c02f07d4d0 + R10: 0000000000000000 R11: 0000000000000002 R12: 0000000000000000 + R13: ffff96c005e800c0 R14: ffffa7cf40d7bdb8 R15: 0000000000000000 + FS: 00007f17856bcfc0(0000) GS:ffff96c03f600000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000060 CR3: 0000000045ed6002 CR4: 00000000003606e0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + ? wait_for_writer+0xb0/0xb0 [btrfs] + btrfs_free_log+0x17/0x30 [btrfs] + btrfs_drop_and_free_fs_root+0x9a/0xe0 [btrfs] + btrfs_free_fs_roots+0xc0/0x130 [btrfs] + ? 
wait_for_completion+0xf2/0x100 + close_ctree+0xea/0x2e0 [btrfs] + ? kthread_stop+0x161/0x260 + generic_shutdown_super+0x6c/0x120 + kill_anon_super+0xe/0x20 + btrfs_kill_super+0x13/0x100 [btrfs] + deactivate_locked_super+0x3f/0x70 + cleanup_mnt+0x3b/0x70 + task_work_run+0x78/0x90 + exit_to_usermode_loop+0x77/0xa6 + do_syscall_64+0x1c5/0x1e0 + entry_SYSCALL_64_after_hwframe+0x42/0xb7 + RIP: 0033:0x7f1784f90827 + RSP: 002b:00007ffdeeb03118 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 + RAX: 0000000000000000 RBX: 0000556a60c62970 RCX: 00007f1784f90827 + RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000556a60c62b50 + RBP: 0000000000000000 R08: 0000000000000005 R09: 00000000ffffffff + R10: 0000556a60c63900 R11: 0000000000000246 R12: 0000556a60c62b50 + R13: 00007f17854a81c4 R14: 0000000000000000 R15: 0000000000000000 + RIP: free_log_tree+0xd2/0x140 [btrfs] RSP: ffffa7cf40d7bd10 + CR2: 0000000000000060 + +Fixes: 681ae50917df9 ("Btrfs: cleanup reserved space when freeing tree log on error") +CC: # v3.13 +Signed-off-by: Jeff Mahoney +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3196,9 +3196,12 @@ static void free_log_tree(struct btrfs_t + }; + + ret = walk_log_tree(trans, log, &wc); +- /* I don't think this can happen but just in case */ +- if (ret) +- btrfs_abort_transaction(trans, ret); ++ if (ret) { ++ if (trans) ++ btrfs_abort_transaction(trans, ret); ++ else ++ btrfs_handle_fs_error(log->fs_info, ret, NULL); ++ } + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, diff --git a/queue-4.19/btrfs-fix-warning-when-replaying-log-after-fsync-of-a-tmpfile.patch b/queue-4.19/btrfs-fix-warning-when-replaying-log-after-fsync-of-a-tmpfile.patch new file mode 100644 index 00000000000..f1831bd8088 --- /dev/null +++ 
b/queue-4.19/btrfs-fix-warning-when-replaying-log-after-fsync-of-a-tmpfile.patch @@ -0,0 +1,177 @@ +From f2d72f42d5fa3bf33761d9e47201745f624fcff5 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 8 Oct 2018 11:12:55 +0100 +Subject: Btrfs: fix warning when replaying log after fsync of a tmpfile + +From: Filipe Manana + +commit f2d72f42d5fa3bf33761d9e47201745f624fcff5 upstream. + +When replaying a log which contains a tmpfile (which necessarily has a +link count of 0) we end up calling inc_nlink(), at +fs/btrfs/tree-log.c:replay_one_buffer(), which produces a warning like +the following: + + [195191.943673] WARNING: CPU: 0 PID: 6924 at fs/inode.c:342 inc_nlink+0x33/0x40 + [195191.943723] CPU: 0 PID: 6924 Comm: mount Not tainted 4.19.0-rc6-btrfs-next-38 #1 + [195191.943724] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 + [195191.943726] RIP: 0010:inc_nlink+0x33/0x40 + [195191.943728] RSP: 0018:ffffb96e425e3870 EFLAGS: 00010246 + [195191.943730] RAX: 0000000000000000 RBX: ffff8c0d1e6af4f0 RCX: 0000000000000006 + [195191.943731] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8c0d1e6af4f0 + [195191.943731] RBP: 0000000000000097 R08: 0000000000000001 R09: 0000000000000000 + [195191.943732] R10: 0000000000000000 R11: 0000000000000000 R12: ffffb96e425e3a60 + [195191.943733] R13: ffff8c0d10cff0c8 R14: ffff8c0d0d515348 R15: ffff8c0d78a1b3f8 + [195191.943735] FS: 00007f570ee24480(0000) GS:ffff8c0dfb200000(0000) knlGS:0000000000000000 + [195191.943736] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [195191.943737] CR2: 00005593286277c8 CR3: 00000000bb8f2006 CR4: 00000000003606f0 + [195191.943739] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [195191.943740] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [195191.943741] Call Trace: + [195191.943778] replay_one_buffer+0x797/0x7d0 [btrfs] + [195191.943802] walk_up_log_tree+0x1c1/0x250 [btrfs] + 
[195191.943809] ? rcu_read_lock_sched_held+0x3f/0x70 + [195191.943825] walk_log_tree+0xae/0x1d0 [btrfs] + [195191.943840] btrfs_recover_log_trees+0x1d7/0x4d0 [btrfs] + [195191.943856] ? replay_dir_deletes+0x280/0x280 [btrfs] + [195191.943870] open_ctree+0x1c3b/0x22a0 [btrfs] + [195191.943887] btrfs_mount_root+0x6b4/0x800 [btrfs] + [195191.943894] ? rcu_read_lock_sched_held+0x3f/0x70 + [195191.943899] ? pcpu_alloc+0x55b/0x7c0 + [195191.943906] ? mount_fs+0x3b/0x140 + [195191.943908] mount_fs+0x3b/0x140 + [195191.943912] ? __init_waitqueue_head+0x36/0x50 + [195191.943916] vfs_kern_mount+0x62/0x160 + [195191.943927] btrfs_mount+0x134/0x890 [btrfs] + [195191.943936] ? rcu_read_lock_sched_held+0x3f/0x70 + [195191.943938] ? pcpu_alloc+0x55b/0x7c0 + [195191.943943] ? mount_fs+0x3b/0x140 + [195191.943952] ? btrfs_remount+0x570/0x570 [btrfs] + [195191.943954] mount_fs+0x3b/0x140 + [195191.943956] ? __init_waitqueue_head+0x36/0x50 + [195191.943960] vfs_kern_mount+0x62/0x160 + [195191.943963] do_mount+0x1f9/0xd40 + [195191.943967] ? 
memdup_user+0x4b/0x70 + [195191.943971] ksys_mount+0x7e/0xd0 + [195191.943974] __x64_sys_mount+0x21/0x30 + [195191.943977] do_syscall_64+0x60/0x1b0 + [195191.943980] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [195191.943983] RIP: 0033:0x7f570e4e524a + [195191.943986] RSP: 002b:00007ffd83589478 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 + [195191.943989] RAX: ffffffffffffffda RBX: 0000563f335b2060 RCX: 00007f570e4e524a + [195191.943990] RDX: 0000563f335b2240 RSI: 0000563f335b2280 RDI: 0000563f335b2260 + [195191.943992] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000020 + [195191.943993] R10: 00000000c0ed0000 R11: 0000000000000206 R12: 0000563f335b2260 + [195191.943994] R13: 0000563f335b2240 R14: 0000000000000000 R15: 00000000ffffffff + [195191.944002] irq event stamp: 8688 + [195191.944010] hardirqs last enabled at (8687): [] console_unlock+0x503/0x640 + [195191.944012] hardirqs last disabled at (8688): [] trace_hardirqs_off_thunk+0x1a/0x1c + [195191.944018] softirqs last enabled at (8638): [] __set_page_dirty_nobuffers+0x101/0x150 + [195191.944020] softirqs last disabled at (8634): [] wb_wakeup_delayed+0x2e/0x60 + [195191.944022] ---[ end trace 5d6e873a9a0b811a ]--- + +This happens because the inode does not have the flag I_LINKABLE set, +which is a runtime only flag, not meant to be persisted, set when the +inode is created through open(2) if the flag O_EXCL is not passed to it. +Except for the warning, there are no other consequences (like corruptions +or metadata inconsistencies). + +Since it's pointless to replay a tmpfile as it would be deleted in a +later phase of the log replay procedure (it has a link count of 0), fix +this by not logging tmpfiles and if a tmpfile is found in a log (created +by a kernel without this change), skip the replay of the inode. + +A test case for fstests follows soon. 
+ +Fixes: 471d557afed1 ("Btrfs: fix loss of prealloc extents past i_size after fsync log replay") +CC: stable@vger.kernel.org # 4.18+ +Reported-by: Martin Steigerwald +Link: https://lore.kernel.org/linux-btrfs/3666619.NTnn27ZJZE@merkaba/ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 42 ++++++++++++++++++++++++++++++++---------- + 1 file changed, 32 insertions(+), 10 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -258,6 +258,13 @@ struct walk_control { + /* what stage of the replay code we're currently in */ + int stage; + ++ /* ++ * Ignore any items from the inode currently being processed. Needs ++ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in ++ * the LOG_WALK_REPLAY_INODES stage. ++ */ ++ bool ignore_cur_inode; ++ + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + +@@ -2487,6 +2494,20 @@ static int replay_one_buffer(struct btrf + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); ++ /* ++ * If we have a tmpfile (O_TMPFILE) that got fsync'ed ++ * and never got linked before the fsync, skip it, as ++ * replaying it is pointless since it would be deleted ++ * later. We skip logging tmpfiles, but it's always ++ * possible we are replaying a log created with a kernel ++ * that used to log tmpfiles. ++ */ ++ if (btrfs_inode_nlink(eb, inode_item) == 0) { ++ wc->ignore_cur_inode = true; ++ continue; ++ } else { ++ wc->ignore_cur_inode = false; ++ } + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) +@@ -2524,16 +2545,8 @@ static int replay_one_buffer(struct btrf + root->fs_info->sectorsize); + ret = btrfs_drop_extents(wc->trans, root, inode, + from, (u64)-1, 1); +- /* +- * If the nlink count is zero here, the iput +- * will free the inode. We bump it to make +- * sure it doesn't get freed until the link +- * count fixup is done. 
+- */ + if (!ret) { +- if (inode->i_nlink == 0) +- inc_nlink(inode); +- /* Update link count and nbytes. */ ++ /* Update the inode's nbytes. */ + ret = btrfs_update_inode(wc->trans, + root, inode); + } +@@ -2548,6 +2561,9 @@ static int replay_one_buffer(struct btrf + break; + } + ++ if (wc->ignore_cur_inode) ++ continue; ++ + if (key.type == BTRFS_DIR_INDEX_KEY && + wc->stage == LOG_WALK_REPLAY_DIR_INDEX) { + ret = replay_one_dir_item(wc->trans, root, path, +@@ -5643,7 +5659,13 @@ static int btrfs_log_inode_parent(struct + if (ret) + goto end_no_trans; + +- if (btrfs_inode_in_log(inode, trans->transid)) { ++ /* ++ * Skip already logged inodes or inodes corresponding to tmpfiles ++ * (since logging them is pointless, a link count of 0 means they ++ * will never be accessible). ++ */ ++ if (btrfs_inode_in_log(inode, trans->transid) || ++ inode->vfs_inode.i_nlink == 0) { + ret = BTRFS_NO_LOG_SYNC; + goto end_no_trans; + } diff --git a/queue-4.19/btrfs-fix-wrong-dentries-after-fsync-of-file-that-got-its-parent-replaced.patch b/queue-4.19/btrfs-fix-wrong-dentries-after-fsync-of-file-that-got-its-parent-replaced.patch new file mode 100644 index 00000000000..63b2b5c76e0 --- /dev/null +++ b/queue-4.19/btrfs-fix-wrong-dentries-after-fsync-of-file-that-got-its-parent-replaced.patch @@ -0,0 +1,86 @@ +From 0f375eed92b5a407657532637ed9652611a682f5 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 9 Oct 2018 15:05:29 +0100 +Subject: Btrfs: fix wrong dentries after fsync of file that got its parent replaced + +From: Filipe Manana + +commit 0f375eed92b5a407657532637ed9652611a682f5 upstream. 
+ +In a scenario like the following: + + mkdir /mnt/A # inode 258 + mkdir /mnt/B # inode 259 + touch /mnt/B/bar # inode 260 + + sync + + mv /mnt/B/bar /mnt/A/bar + mv -T /mnt/A /mnt/B + fsync /mnt/B/bar + + + +After replaying the log we end up with file bar having 2 hard links, both +with the name 'bar' and one in the directory with inode number 258 and the +other in the directory with inode number 259. Also, we end up with the +directory inode 259 still existing and with the directory inode 258 still +named as 'A', instead of 'B'. In this scenario, file 'bar' should only +have one hard link, located at directory inode 258, the directory inode +259 should not exist anymore and the name for directory inode 258 should +be 'B'. + +This incorrect behaviour happens because when attempting to log the old +parents of an inode, we skip any parents that no longer exist. Fix this +by forcing a full commit if an old parent no longer exists. + +A test case for fstests follows soon. + +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 30 +++++++++++++++++++++++++++--- + 1 file changed, 27 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -5583,9 +5583,33 @@ static int btrfs_log_all_parents(struct + + dir_inode = btrfs_iget(fs_info->sb, &inode_key, + root, NULL); +- /* If parent inode was deleted, skip it. */ +- if (IS_ERR(dir_inode)) +- continue; ++ /* ++ * If the parent inode was deleted, return an error to ++ * fallback to a transaction commit. This is to prevent ++ * getting an inode that was moved from one parent A to ++ * a parent B, got its former parent A deleted and then ++ * it got fsync'ed, from existing at both parents after ++ * a log replay (and the old parent still existing). 
++ * Example: ++ * ++ * mkdir /mnt/A ++ * mkdir /mnt/B ++ * touch /mnt/B/bar ++ * sync ++ * mv /mnt/B/bar /mnt/A/bar ++ * mv -T /mnt/A /mnt/B ++ * fsync /mnt/B/bar ++ * ++ * ++ * If we ignore the old parent B which got deleted, ++ * after a log replay we would have file bar linked ++ * at both parents and the old parent B would still ++ * exist. ++ */ ++ if (IS_ERR(dir_inode)) { ++ ret = PTR_ERR(dir_inode); ++ goto out; ++ } + + if (ctx) + ctx->log_new_dentries = false; diff --git a/queue-4.19/btrfs-handle-owner-mismatch-gracefully-when-walking-up-tree.patch b/queue-4.19/btrfs-handle-owner-mismatch-gracefully-when-walking-up-tree.patch new file mode 100644 index 00000000000..97dbcbaac61 --- /dev/null +++ b/queue-4.19/btrfs-handle-owner-mismatch-gracefully-when-walking-up-tree.patch @@ -0,0 +1,118 @@ +From 65c6e82becec33731f48786e5a30f98662c86b16 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 21 Aug 2018 09:42:03 +0800 +Subject: btrfs: Handle owner mismatch gracefully when walking up tree + +From: Qu Wenruo + +commit 65c6e82becec33731f48786e5a30f98662c86b16 upstream. + +[BUG] +When mounting certain crafted image, btrfs will trigger kernel BUG_ON() +when trying to recover balance: + + kernel BUG at fs/btrfs/extent-tree.c:8956! 
+ invalid opcode: 0000 [#1] PREEMPT SMP NOPTI + CPU: 1 PID: 662 Comm: mount Not tainted 4.18.0-rc1-custom+ #10 + RIP: 0010:walk_up_proc+0x336/0x480 [btrfs] + RSP: 0018:ffffb53540c9b890 EFLAGS: 00010202 + Call Trace: + walk_up_tree+0x172/0x1f0 [btrfs] + btrfs_drop_snapshot+0x3a4/0x830 [btrfs] + merge_reloc_roots+0xe1/0x1d0 [btrfs] + btrfs_recover_relocation+0x3ea/0x420 [btrfs] + open_ctree+0x1af3/0x1dd0 [btrfs] + btrfs_mount_root+0x66b/0x740 [btrfs] + mount_fs+0x3b/0x16a + vfs_kern_mount.part.9+0x54/0x140 + btrfs_mount+0x16d/0x890 [btrfs] + mount_fs+0x3b/0x16a + vfs_kern_mount.part.9+0x54/0x140 + do_mount+0x1fd/0xda0 + ksys_mount+0xba/0xd0 + __x64_sys_mount+0x21/0x30 + do_syscall_64+0x60/0x210 + entry_SYSCALL_64_after_hwframe+0x49/0xbe + +[CAUSE] +Extent tree corruption. In this particular case, reloc tree root's +owner is DATA_RELOC_TREE (should be TREE_RELOC), thus its backref is +corrupted and we failed the owner check in walk_up_tree(). + +[FIX] +It's pretty hard to take care of every extent tree corruption, but at +least we can remove such BUG_ON() and exit more gracefully. + +And since in this particular image, DATA_RELOC_TREE and TREE_RELOC share +the same root (which is obviously invalid), we need to make +__del_reloc_root() more robust to detect such invalid sharing to avoid +possible NULL dereference as root->node can be NULL in this case. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=200411 +Reported-by: Xu Wen +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 18 ++++++++++++------ + fs/btrfs/relocation.c | 2 +- + 2 files changed, 13 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -8763,15 +8763,14 @@ static noinline int walk_up_proc(struct + if (eb == root->node) { + if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = eb->start; +- else +- BUG_ON(root->root_key.objectid != +- btrfs_header_owner(eb)); ++ else if (root->root_key.objectid != btrfs_header_owner(eb)) ++ goto owner_mismatch; + } else { + if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF) + parent = path->nodes[level + 1]->start; +- else +- BUG_ON(root->root_key.objectid != +- btrfs_header_owner(path->nodes[level + 1])); ++ else if (root->root_key.objectid != ++ btrfs_header_owner(path->nodes[level + 1])) ++ goto owner_mismatch; + } + + btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1); +@@ -8779,6 +8778,11 @@ out: + wc->refs[level] = 0; + wc->flags[level] = 0; + return 0; ++ ++owner_mismatch: ++ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu", ++ btrfs_header_owner(eb), root->root_key.objectid); ++ return -EUCLEAN; + } + + static noinline int walk_down_tree(struct btrfs_trans_handle *trans, +@@ -8832,6 +8836,8 @@ static noinline int walk_up_tree(struct + ret = walk_up_proc(trans, root, path, wc); + if (ret > 0) + return 0; ++ if (ret < 0) ++ return ret; + + if (path->locks[level]) { + btrfs_tree_unlock_rw(path->nodes[level], +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1281,7 +1281,7 @@ static void __del_reloc_root(struct btrf + struct mapping_node *node = NULL; + struct reloc_control *rc = fs_info->reloc_ctl; + +- if (rc) { ++ if (rc && root->node) { + 
spin_lock(&rc->reloc_root_tree.lock); + rb_node = tree_search(&rc->reloc_root_tree.rb_root, + root->node->start); diff --git a/queue-4.19/btrfs-iterate-all-devices-during-trim-instead-of-fs_devices-alloc_list.patch b/queue-4.19/btrfs-iterate-all-devices-during-trim-instead-of-fs_devices-alloc_list.patch new file mode 100644 index 00000000000..cc60cf00633 --- /dev/null +++ b/queue-4.19/btrfs-iterate-all-devices-during-trim-instead-of-fs_devices-alloc_list.patch @@ -0,0 +1,42 @@ +From d4e329de5e5e21594df2e0dd59da9acee71f133b Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Thu, 6 Sep 2018 17:18:14 -0400 +Subject: btrfs: iterate all devices during trim, instead of fs_devices::alloc_list + +From: Jeff Mahoney + +commit d4e329de5e5e21594df2e0dd59da9acee71f133b upstream. + +btrfs_trim_fs iterates over the fs_devices->alloc_list while holding the +device_list_mutex. The problem is that ->alloc_list is protected by the +chunk mutex. We don't want to hold the chunk mutex over the trim of the +entire file system. Fortunately, the ->dev_list list is protected by +the dev_list mutex and while it will give us all devices, including +read-only devices, we already just skip the read-only devices. Then we +can continue to take and release the chunk mutex while scanning each +device. 
+ +Fixes: 499f377f49f ("btrfs: iterate over unused chunk space in FITRIM") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Jeff Mahoney +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10920,8 +10920,8 @@ int btrfs_trim_fs(struct btrfs_fs_info * + "failed to trim %llu block group(s), last error %d", + bg_failed, bg_ret); + mutex_lock(&fs_info->fs_devices->device_list_mutex); +- devices = &fs_info->fs_devices->alloc_list; +- list_for_each_entry(device, devices, dev_alloc_list) { ++ devices = &fs_info->fs_devices->devices; ++ list_for_each_entry(device, devices, dev_list) { + ret = btrfs_trim_free_extents(device, range->minlen, + &group_trimmed); + if (ret) { diff --git a/queue-4.19/btrfs-keep-trim-from-interfering-with-transaction-commits.patch b/queue-4.19/btrfs-keep-trim-from-interfering-with-transaction-commits.patch new file mode 100644 index 00000000000..849581c27d4 --- /dev/null +++ b/queue-4.19/btrfs-keep-trim-from-interfering-with-transaction-commits.patch @@ -0,0 +1,113 @@ +From fee7acc361314df6561208c2d3c0882d663dd537 Mon Sep 17 00:00:00 2001 +From: Jeff Mahoney +Date: Thu, 6 Sep 2018 17:18:16 -0400 +Subject: btrfs: keep trim from interfering with transaction commits + +From: Jeff Mahoney + +commit fee7acc361314df6561208c2d3c0882d663dd537 upstream. + +Commit 499f377f49f08 (btrfs: iterate over unused chunk space in FITRIM) +fixed free space trimming, but introduced latency when it was running. +This is due to it pinning the transaction using both a incremented +refcount and holding the commit root sem for the duration of a single +trim operation. + +This was to ensure safety but it's unnecessary. We already hold the the +chunk mutex so we know that the chunk we're using can't be allocated +while we're trimming it. 
+ +In order to check against chunks allocated already in this transaction, +we need to check the pending chunks list. To do that safely without +joining the transaction (or attaching and then having to commit it) we +need to ensure that the dev root's commit root doesn't change underneath +us and the pending chunks list stays around until we're done with it. + +We can ensure the former by holding the commit root sem and the latter +by pinning the transaction. We do this now, but the critical section +covers the trim operation itself and we don't need to do that. + +This patch moves the pinning and unpinning logic into helpers and unpins +the transaction after performing the search and check for pending +chunks. + +Limiting the critical section of the transaction pinning improves the +latency substantially on slower storage (e.g. image files over NFS). + +Fixes: 499f377f49f08 ("btrfs: iterate over unused chunk space in FITRIM") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Jeff Mahoney +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 25 +++++++++++++++++-------- + 1 file changed, 17 insertions(+), 8 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10772,14 +10772,16 @@ int btrfs_error_unpin_extent_range(struc + * We don't want a transaction for this since the discard may take a + * substantial amount of time. We don't require that a transaction be + * running, but we do need to take a running transaction into account +- * to ensure that we're not discarding chunks that were released in +- * the current transaction. ++ * to ensure that we're not discarding chunks that were released or ++ * allocated in the current transaction. + * + * Holding the chunks lock will prevent other threads from allocating + * or releasing chunks, but it won't prevent a running transaction + * from committing and releasing the memory that the pending chunks + * list head uses. 
For that, we need to take a reference to the +- * transaction. ++ * transaction and hold the commit root sem. We only need to hold ++ * it while performing the free space search since we have already ++ * held back allocations. + */ + static int btrfs_trim_free_extents(struct btrfs_device *device, + u64 minlen, u64 *trimmed) +@@ -10810,9 +10812,13 @@ static int btrfs_trim_free_extents(struc + + ret = mutex_lock_interruptible(&fs_info->chunk_mutex); + if (ret) +- return ret; ++ break; + +- down_read(&fs_info->commit_root_sem); ++ ret = down_read_killable(&fs_info->commit_root_sem); ++ if (ret) { ++ mutex_unlock(&fs_info->chunk_mutex); ++ break; ++ } + + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; +@@ -10820,13 +10826,17 @@ static int btrfs_trim_free_extents(struc + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + ++ if (!trans) ++ up_read(&fs_info->commit_root_sem); ++ + ret = find_free_dev_extent_start(trans, device, minlen, start, + &start, &len); +- if (trans) ++ if (trans) { ++ up_read(&fs_info->commit_root_sem); + btrfs_put_transaction(trans); ++ } + + if (ret) { +- up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + if (ret == -ENOSPC) + ret = 0; +@@ -10834,7 +10844,6 @@ static int btrfs_trim_free_extents(struc + } + + ret = btrfs_issue_discard(device->bdev, start, len, &bytes); +- up_read(&fs_info->commit_root_sem); + mutex_unlock(&fs_info->chunk_mutex); + + if (ret) diff --git a/queue-4.19/btrfs-locking-add-extra-check-in-btrfs_init_new_buffer-to-avoid-deadlock.patch b/queue-4.19/btrfs-locking-add-extra-check-in-btrfs_init_new_buffer-to-avoid-deadlock.patch new file mode 100644 index 00000000000..05916f1b1ce --- /dev/null +++ b/queue-4.19/btrfs-locking-add-extra-check-in-btrfs_init_new_buffer-to-avoid-deadlock.patch @@ -0,0 +1,130 @@ +From b72c3aba09a53fc7c1824250d71180ca154517a7 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 21 Aug 2018 09:53:47 +0800 +Subject: btrfs: 
locking: Add extra check in btrfs_init_new_buffer() to avoid deadlock + +From: Qu Wenruo + +commit b72c3aba09a53fc7c1824250d71180ca154517a7 upstream. + +[BUG] +For certain crafted image, whose csum root leaf has missing backref, if +we try to trigger write with data csum, it could cause deadlock with the +following kernel WARN_ON(): + + WARNING: CPU: 1 PID: 41 at fs/btrfs/locking.c:230 btrfs_tree_lock+0x3e2/0x400 + CPU: 1 PID: 41 Comm: kworker/u4:1 Not tainted 4.18.0-rc1+ #8 + Workqueue: btrfs-endio-write btrfs_endio_write_helper + RIP: 0010:btrfs_tree_lock+0x3e2/0x400 + Call Trace: + btrfs_alloc_tree_block+0x39f/0x770 + __btrfs_cow_block+0x285/0x9e0 + btrfs_cow_block+0x191/0x2e0 + btrfs_search_slot+0x492/0x1160 + btrfs_lookup_csum+0xec/0x280 + btrfs_csum_file_blocks+0x2be/0xa60 + add_pending_csums+0xaf/0xf0 + btrfs_finish_ordered_io+0x74b/0xc90 + finish_ordered_fn+0x15/0x20 + normal_work_helper+0xf6/0x500 + btrfs_endio_write_helper+0x12/0x20 + process_one_work+0x302/0x770 + worker_thread+0x81/0x6d0 + kthread+0x180/0x1d0 + ret_from_fork+0x35/0x40 + +[CAUSE] +That crafted image has missing backref for csum tree root leaf. And +when we try to allocate new tree block, since there is no +EXTENT/METADATA_ITEM for csum tree root, btrfs consider it's free slot +and use it. + +The extent tree of the image looks like: + + Normal image | This fuzzed image + ----------------------------------+-------------------------------- + BG 29360128 | BG 29360128 + One empty slot | One empty slot + 29364224: backref to UUID tree | 29364224: backref to UUID tree + Two empty slots | Two empty slots + 29376512: backref to CSUM tree | One empty slot (bad type) <<< + 29380608: backref to D_RELOC tree | 29380608: backref to D_RELOC tree + ... | ... + +Since bytenr 29376512 has no METADATA/EXTENT_ITEM, when btrfs try to +alloc tree block, it's an valid slot for btrfs. + +And for finish_ordered_write, when we need to insert csum, we try to CoW +csum tree root. 
+ +By accident, empty slots at bytenr BG_OFFSET, BG_OFFSET + 8K, +BG_OFFSET + 12K are already used by tree block COW for other trees, the +next empty slot is BG_OFFSET + 16K, which should be the backref for CSUM +tree. + +But due to the bad type, btrfs cannot recognize it and still considers it as +an empty slot, and will try to use it for csum tree CoW. + +Then in the following call trace, we will try to lock the new tree +block, which turns out to be the old csum tree root which is already +locked: + +btrfs_search_slot() called on csum tree root, which is at 29376512 +|- btrfs_cow_block() + |- btrfs_set_lock_block() + | |- Now locks tree block 29376512 (old csum tree root) + |- __btrfs_cow_block() + |- btrfs_alloc_tree_block() + |- btrfs_reserve_extent() + | Now it returns tree block 29376512, which extent tree + | shows its empty slot, but it's already held by csum tree + |- btrfs_init_new_buffer() + |- btrfs_tree_lock() + | Triggers WARN_ON(eb->lock_owner == current->pid) + |- wait_event() + Wait lock owner to release the lock, but it's + locked by ourselves, so it will deadlock + +[FIX] +This patch will do the lock_owner and current->pid check at +btrfs_init_new_buffer(). +So above deadlock can be avoided. + +Since such problem can only happen in crafted image, we will still +trigger kernel warning for later aborted transaction, but with a little +more meaningful warning message. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=200405 +Reported-by: Xu Wen +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -8119,6 +8119,19 @@ btrfs_init_new_buffer(struct btrfs_trans + if (IS_ERR(buf)) + return buf; + ++ /* ++ * Extra safety check in case the extent tree is corrupted and extent ++ * allocator chooses to use a tree block which is already used and ++ * locked. ++ */ ++ if (buf->lock_owner == current->pid) { ++ btrfs_err_rl(fs_info, ++"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", ++ buf->start, btrfs_header_owner(buf), current->pid); ++ free_extent_buffer(buf); ++ return ERR_PTR(-EUCLEAN); ++ } ++ + btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); + btrfs_tree_lock(buf); + clean_tree_block(fs_info, buf); diff --git a/queue-4.19/btrfs-make-sure-we-create-all-new-block-groups.patch b/queue-4.19/btrfs-make-sure-we-create-all-new-block-groups.patch new file mode 100644 index 00000000000..64660d4355a --- /dev/null +++ b/queue-4.19/btrfs-make-sure-we-create-all-new-block-groups.patch @@ -0,0 +1,49 @@ +From 545e3366db823dc3342ca9d7fea803f829c9062f Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 28 Sep 2018 07:18:02 -0400 +Subject: btrfs: make sure we create all new block groups + +From: Josef Bacik + +commit 545e3366db823dc3342ca9d7fea803f829c9062f upstream. + +Allocating new chunks modifies both the extent and chunk tree, which can +trigger new chunk allocations. So instead of doing list_for_each_safe, +just do while (!list_empty()) so we make sure we don't exit with other +pending bg's still on our list. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Omar Sandoval +Reviewed-by: Liu Bo +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10094,7 +10094,7 @@ error: + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- struct btrfs_block_group_cache *block_group, *tmp; ++ struct btrfs_block_group_cache *block_group; + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_block_group_item item; + struct btrfs_key key; +@@ -10102,7 +10102,10 @@ void btrfs_create_pending_block_groups(s + bool can_flush_pending_bgs = trans->can_flush_pending_bgs; + + trans->can_flush_pending_bgs = false; +- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { ++ while (!list_empty(&trans->new_bgs)) { ++ block_group = list_first_entry(&trans->new_bgs, ++ struct btrfs_block_group_cache, ++ bg_list); + if (ret) + goto next; + diff --git a/queue-4.19/btrfs-protect-space-cache-inode-alloc-with-gfp_nofs.patch b/queue-4.19/btrfs-protect-space-cache-inode-alloc-with-gfp_nofs.patch new file mode 100644 index 00000000000..1702707ca25 --- /dev/null +++ b/queue-4.19/btrfs-protect-space-cache-inode-alloc-with-gfp_nofs.patch @@ -0,0 +1,57 @@ +From 84de76a2fb217dc1b6bc2965cc397d1648aa1404 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 28 Sep 2018 07:17:49 -0400 +Subject: btrfs: protect space cache inode alloc with GFP_NOFS + +From: Josef Bacik + +commit 84de76a2fb217dc1b6bc2965cc397d1648aa1404 upstream. + +If we're allocating a new space cache inode it's likely going to be +under a transaction handle, so we need to use memalloc_nofs_save() in +order to avoid deadlocks, and more importantly lockdep messages that +make xfstests fail. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Omar Sandoval +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/free-space-cache.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + #include "ctree.h" + #include "free-space-cache.h" + #include "transaction.h" +@@ -47,6 +48,7 @@ static struct inode *__lookup_free_space + struct btrfs_free_space_header *header; + struct extent_buffer *leaf; + struct inode *inode = NULL; ++ unsigned nofs_flag; + int ret; + + key.objectid = BTRFS_FREE_SPACE_OBJECTID; +@@ -68,7 +70,13 @@ static struct inode *__lookup_free_space + btrfs_disk_key_to_cpu(&location, &disk_key); + btrfs_release_path(path); + ++ /* ++ * We are often under a trans handle at this point, so we need to make ++ * sure NOFS is set to keep us from deadlocking. ++ */ ++ nofs_flag = memalloc_nofs_save(); + inode = btrfs_iget(fs_info->sb, &location, root, NULL); ++ memalloc_nofs_restore(nofs_flag); + if (IS_ERR(inode)) + return inode; + diff --git a/queue-4.19/btrfs-qgroup-avoid-calling-qgroup-functions-if-qgroup-is-not-enabled.patch b/queue-4.19/btrfs-qgroup-avoid-calling-qgroup-functions-if-qgroup-is-not-enabled.patch new file mode 100644 index 00000000000..9030d3c9864 --- /dev/null +++ b/queue-4.19/btrfs-qgroup-avoid-calling-qgroup-functions-if-qgroup-is-not-enabled.patch @@ -0,0 +1,69 @@ +From 3628b4ca64f24a4ec55055597d0cb1c814729f8b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 9 Oct 2018 14:36:45 +0800 +Subject: btrfs: qgroup: Avoid calling qgroup functions if qgroup is not enabled + +From: Qu Wenruo + +commit 3628b4ca64f24a4ec55055597d0cb1c814729f8b upstream. + +Some qgroup trace events like btrfs_qgroup_release_data() and +btrfs_qgroup_free_delayed_ref() can still be triggered even if qgroup is +not enabled. 
+ +This is caused by the lack of qgroup status check before calling some +qgroup functions. Thankfully the functions can handle quota disabled +case well and just do nothing for qgroup disabled case. + +This patch will do earlier check before triggering related trace events. + +And for enabled <-> disabled race case: + +1) For enabled->disabled case + Disable will wipe out all qgroups data including reservation and + excl/rfer. Even if we leak some reservation or numbers, it will + still be cleared, so nothing will go wrong. + +2) For disabled -> enabled case + Current btrfs_qgroup_release_data() will use extent_io tree to ensure + we won't underflow reservation. And for delayed_ref we use + head->qgroup_reserved to record the reserved space, so in that case + head->qgroup_reserved should be 0 and we won't underflow. + +CC: stable@vger.kernel.org # 4.14+ +Reported-by: Chris Murphy +Link: https://lore.kernel.org/linux-btrfs/CAJCQCtQau7DtuUUeycCkZ36qjbKuxNzsgqJ7+sJ6W0dK_NLE3w@mail.gmail.com/ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 4 ++++ + fs/btrfs/qgroup.h | 2 ++ + 2 files changed, 6 insertions(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3106,6 +3106,10 @@ static int __btrfs_qgroup_release_data(s + int trace_op = QGROUP_RELEASE; + int ret; + ++ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, ++ &BTRFS_I(inode)->root->fs_info->flags)) ++ return 0; ++ + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) +--- a/fs/btrfs/qgroup.h ++++ b/fs/btrfs/qgroup.h +@@ -249,6 +249,8 @@ void btrfs_qgroup_free_refroot(struct bt + static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, + u64 ref_root, u64 num_bytes) + { ++ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) ++ return; + trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, 
ref_root, num_bytes, + BTRFS_QGROUP_RSV_DATA); diff --git a/queue-4.19/btrfs-qgroup-dirty-all-qgroups-before-rescan.patch b/queue-4.19/btrfs-qgroup-dirty-all-qgroups-before-rescan.patch new file mode 100644 index 00000000000..6704da5960a --- /dev/null +++ b/queue-4.19/btrfs-qgroup-dirty-all-qgroups-before-rescan.patch @@ -0,0 +1,85 @@ +From 9c7b0c2e8dbfbcd80a71e2cbfe02704f26c185c6 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 10 Aug 2018 10:20:26 +0800 +Subject: btrfs: qgroup: Dirty all qgroups before rescan + +From: Qu Wenruo + +commit 9c7b0c2e8dbfbcd80a71e2cbfe02704f26c185c6 upstream. + +[BUG] +In the following case, rescan won't zero out the number of qgroup 1/0: + + $ mkfs.btrfs -fq $DEV + $ mount $DEV /mnt + + $ btrfs quota enable /mnt + $ btrfs qgroup create 1/0 /mnt + $ btrfs sub create /mnt/sub + $ btrfs qgroup assign 0/257 1/0 /mnt + + $ dd if=/dev/urandom of=/mnt/sub/file bs=1k count=1000 + $ btrfs sub snap /mnt/sub /mnt/snap + $ btrfs quota rescan -w /mnt + $ btrfs qgroup show -pcre /mnt + qgroupid rfer excl max_rfer max_excl parent child + -------- ---- ---- -------- -------- ------ ----- + 0/5 16.00KiB 16.00KiB none none --- --- + 0/257 1016.00KiB 16.00KiB none none 1/0 --- + 0/258 1016.00KiB 16.00KiB none none --- --- + 1/0 1016.00KiB 16.00KiB none none --- 0/257 + +So far so good, but: + + $ btrfs qgroup remove 0/257 1/0 /mnt + WARNING: quotas may be inconsistent, rescan needed + $ btrfs quota rescan -w /mnt + $ btrfs qgroup show -pcre /mnt + qgoupid rfer excl max_rfer max_excl parent child + -------- ---- ---- -------- -------- ------ ----- + 0/5 16.00KiB 16.00KiB none none --- --- + 0/257 1016.00KiB 16.00KiB none none --- --- + 0/258 1016.00KiB 16.00KiB none none --- --- + 1/0 1016.00KiB 16.00KiB none none --- --- + ^^^^^^^^^^ ^^^^^^^^ not cleared + +[CAUSE] +Before rescan we call qgroup_rescan_zero_tracking() to zero out all +qgroups' accounting numbers. + +However we don't mark all qgroups dirty, but rely on rescan to do so. 
+ +If we have any high level qgroup without children, it won't be marked +dirty during rescan, since we cannot reach that qgroup. + +This will cause QGROUP_INFO items of childless qgroups never get updated +in the quota tree, thus their numbers will stay the same in "btrfs +qgroup show" output. + +[FIX] +Just mark all qgroups dirty in qgroup_rescan_zero_tracking(), so even if +we have childless qgroups, their QGROUP_INFO items will still get +updated during rescan. + +Reported-by: Misono Tomohiro +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Reviewed-by: Misono Tomohiro +Tested-by: Misono Tomohiro +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -2897,6 +2897,7 @@ qgroup_rescan_zero_tracking(struct btrfs + qgroup->rfer_cmpr = 0; + qgroup->excl = 0; + qgroup->excl_cmpr = 0; ++ qgroup_dirty(fs_info, qgroup); + } + spin_unlock(&fs_info->qgroup_lock); + } diff --git a/queue-4.19/btrfs-release-metadata-before-running-delayed-refs.patch b/queue-4.19/btrfs-release-metadata-before-running-delayed-refs.patch new file mode 100644 index 00000000000..04e7a57da11 --- /dev/null +++ b/queue-4.19/btrfs-release-metadata-before-running-delayed-refs.patch @@ -0,0 +1,47 @@ +From f45c752b65af46bf42963295c332865d95f97fff Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 28 Sep 2018 07:17:48 -0400 +Subject: btrfs: release metadata before running delayed refs + +From: Josef Bacik + +commit f45c752b65af46bf42963295c332865d95f97fff upstream. + +We want to release the unused reservation we have since it refills the +delayed refs reserve, which will make everything go smoother when +running the delayed refs if we're short on our reservation. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Omar Sandoval +Reviewed-by: Liu Bo +Reviewed-by: Nikolay Borisov +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/transaction.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1929,6 +1929,9 @@ int btrfs_commit_transaction(struct btrf + return ret; + } + ++ btrfs_trans_release_metadata(trans); ++ trans->block_rsv = NULL; ++ + /* make a pass through all the delayed refs we have so far + * any runnings procs may add more while we are here + */ +@@ -1938,9 +1941,6 @@ int btrfs_commit_transaction(struct btrf + return ret; + } + +- btrfs_trans_release_metadata(trans); +- trans->block_rsv = NULL; +- + cur_trans = trans->transaction; + + /* diff --git a/queue-4.19/btrfs-reset-max_extent_size-on-clear-in-a-bitmap.patch b/queue-4.19/btrfs-reset-max_extent_size-on-clear-in-a-bitmap.patch new file mode 100644 index 00000000000..8efba6e4191 --- /dev/null +++ b/queue-4.19/btrfs-reset-max_extent_size-on-clear-in-a-bitmap.patch @@ -0,0 +1,34 @@ +From 553cceb49681d60975d00892877d4c871bf220f9 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 28 Sep 2018 07:18:00 -0400 +Subject: btrfs: reset max_extent_size on clear in a bitmap + +From: Josef Bacik + +commit 553cceb49681d60975d00892877d4c871bf220f9 upstream. + +We need to clear the max_extent_size when we clear bits from a bitmap +since it could have been from the range that contains the +max_extent_size. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Liu Bo +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/free-space-cache.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -1687,6 +1687,8 @@ static inline void __bitmap_clear_bits(s + bitmap_clear(info->bitmap, start, count); + + info->bytes -= bytes; ++ if (info->max_extent_size > ctl->unit) ++ info->max_extent_size = 0; + } + + static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, diff --git a/queue-4.19/btrfs-wait-on-caching-when-putting-the-bg-cache.patch b/queue-4.19/btrfs-wait-on-caching-when-putting-the-bg-cache.patch new file mode 100644 index 00000000000..017959ce9c9 --- /dev/null +++ b/queue-4.19/btrfs-wait-on-caching-when-putting-the-bg-cache.patch @@ -0,0 +1,85 @@ +From 3aa7c7a31c26321696b92841d5103461c6f3f517 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 12 Sep 2018 10:45:45 -0400 +Subject: btrfs: wait on caching when putting the bg cache + +From: Josef Bacik + +commit 3aa7c7a31c26321696b92841d5103461c6f3f517 upstream. + +While testing my backport I noticed there was a panic if I ran +generic/416 generic/417 generic/418 all in a row. This just happened to +uncover a race where we had outstanding IO after we destroy all of our +workqueues, and then we'd go to queue the endio work on those free'd +workqueues. + +This is because we aren't waiting for the caching threads to be done +before freeing everything up, so to fix this make sure we wait on any +outstanding caching that's being done before we free up the block group, +so we're sure to be done with all IO by the time we get to +btrfs_stop_all_workers(). This fixes the panic I was seeing +consistently in testing. + +------------[ cut here ]------------ +kernel BUG at fs/btrfs/volumes.c:6112! 
+SMP PTI +Modules linked in: +CPU: 1 PID: 27165 Comm: kworker/u4:7 Not tainted 4.16.0-02155-g3553e54a578d-dirty #875 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 +Workqueue: btrfs-cache btrfs_cache_helper +RIP: 0010:btrfs_map_bio+0x346/0x370 +RSP: 0000:ffffc900061e79d0 EFLAGS: 00010202 +RAX: 0000000000000000 RBX: ffff880071542e00 RCX: 0000000000533000 +RDX: ffff88006bb74380 RSI: 0000000000000008 RDI: ffff880078160000 +RBP: 0000000000000001 R08: ffff8800781cd200 R09: 0000000000503000 +R10: ffff88006cd21200 R11: 0000000000000000 R12: 0000000000000000 +R13: 0000000000000000 R14: ffff8800781cd200 R15: ffff880071542e00 +FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 000000000817ffc4 CR3: 0000000078314000 CR4: 00000000000006e0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + btree_submit_bio_hook+0x8a/0xd0 + submit_one_bio+0x5d/0x80 + read_extent_buffer_pages+0x18a/0x320 + btree_read_extent_buffer_pages+0xbc/0x200 + ? alloc_extent_buffer+0x359/0x3e0 + read_tree_block+0x3d/0x60 + read_block_for_search.isra.30+0x1a5/0x360 + btrfs_search_slot+0x41b/0xa10 + btrfs_next_old_leaf+0x212/0x470 + caching_thread+0x323/0x490 + normal_work_helper+0xc5/0x310 + process_one_work+0x141/0x340 + worker_thread+0x44/0x3c0 + kthread+0xf8/0x130 + ? process_one_work+0x340/0x340 + ? 
kthread_bind+0x10/0x10 + ret_from_fork+0x35/0x40 +RIP: btrfs_map_bio+0x346/0x370 RSP: ffffc900061e79d0 +---[ end trace 827eb13e50846033 ]--- +Kernel panic - not syncing: Fatal exception +Kernel Offset: disabled +---[ end Kernel panic - not syncing: Fatal exception + +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: Omar Sandoval +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -9632,6 +9632,7 @@ void btrfs_put_block_group_cache(struct + + block_group = btrfs_lookup_first_block_group(info, last); + while (block_group) { ++ wait_block_group_cache_done(block_group); + spin_lock(&block_group->lock); + if (block_group->iref) + break; diff --git a/queue-4.19/series b/queue-4.19/series index be9549d50c6..a3a00096868 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -323,3 +323,22 @@ soc-tegra-pmc-fix-child-node-lookup.patch selftests-ftrace-fix-synthetic-event-test-to-delete-event-correctly.patch selftests-powerpc-fix-ptrace-tm-failure.patch tracing-return-enoent-if-there-is-no-target-synthetic-event.patch +btrfs-qgroup-avoid-calling-qgroup-functions-if-qgroup-is-not-enabled.patch +btrfs-handle-owner-mismatch-gracefully-when-walking-up-tree.patch +btrfs-locking-add-extra-check-in-btrfs_init_new_buffer-to-avoid-deadlock.patch +btrfs-fix-error-handling-in-free_log_tree.patch +btrfs-fix-error-handling-in-btrfs_dev_replace_start.patch +btrfs-enhance-btrfs_trim_fs-function-to-handle-error-better.patch +btrfs-ensure-btrfs_trim_fs-can-trim-the-whole-filesystem.patch +btrfs-iterate-all-devices-during-trim-instead-of-fs_devices-alloc_list.patch +btrfs-don-t-attempt-to-trim-devices-that-don-t-support-it.patch +btrfs-keep-trim-from-interfering-with-transaction-commits.patch +btrfs-wait-on-caching-when-putting-the-bg-cache.patch +btrfs-don-t-clean-dirty-pages-during-buffered-writes.patch 
+btrfs-release-metadata-before-running-delayed-refs.patch +btrfs-protect-space-cache-inode-alloc-with-gfp_nofs.patch +btrfs-reset-max_extent_size-on-clear-in-a-bitmap.patch +btrfs-make-sure-we-create-all-new-block-groups.patch +btrfs-fix-warning-when-replaying-log-after-fsync-of-a-tmpfile.patch +btrfs-fix-wrong-dentries-after-fsync-of-file-that-got-its-parent-replaced.patch +btrfs-qgroup-dirty-all-qgroups-before-rescan.patch