From: Greg Kroah-Hartman Date: Sat, 5 Mar 2022 10:47:54 +0000 (+0100) Subject: 5.16-stable patches X-Git-Tag: v4.9.305~90 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=daffb0bc440acb25de8f311c8c51061b59b7dd7a;p=thirdparty%2Fkernel%2Fstable-queue.git 5.16-stable patches added patches: btrfs-defrag-bring-back-the-old-file-extent-search-behavior.patch btrfs-defrag-don-t-use-merged-extent-map-for-their-generation-check.patch --- diff --git a/queue-5.16/btrfs-defrag-bring-back-the-old-file-extent-search-behavior.patch b/queue-5.16/btrfs-defrag-bring-back-the-old-file-extent-search-behavior.patch new file mode 100644 index 00000000000..44005fd1f8e --- /dev/null +++ b/queue-5.16/btrfs-defrag-bring-back-the-old-file-extent-search-behavior.patch @@ -0,0 +1,251 @@ +From d5633b0dee02d7d25e93463a03709f11c71500e2 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 11 Feb 2022 14:46:12 +0800 +Subject: btrfs: defrag: bring back the old file extent search behavior + +From: Qu Wenruo + +commit d5633b0dee02d7d25e93463a03709f11c71500e2 upstream. + +For defrag, we don't really want to use btrfs_get_extent() to iterate +all extent maps of an inode. + +The reasons are: + +- btrfs_get_extent() can merge extent maps + And the result em has the higher generation of the two, causing defrag + to mark unnecessary part of such merged large extent map. + + This in fact can result extra IO for autodefrag in v5.16+ kernels. + + However this patch is not going to completely solve the problem, as + one can still using read() to trigger extent map reading, and got + them merged. + + The completely solution for the extent map merging generation problem + will come as an standalone fix. + +- btrfs_get_extent() caches the extent map result + Normally it's fine, but for defrag the target range may not get + another read/write for a long long time. + Such cache would only increase the memory usage. + +- btrfs_get_extent() doesn't skip older extent map + Unlike the old find_new_extent() which uses btrfs_search_forward() to + skip the older subtree, thus it will pick up unnecessary extent maps. + +This patch will fix the regression by introducing defrag_get_extent() to +replace the btrfs_get_extent() call. + +This helper will: + +- Not cache the file extent we found + It will search the file extent and manually convert it to em. + +- Use btrfs_search_forward() to skip entire ranges which is modified in + the past + +This should reduce the IO for autodefrag. + +Reported-by: Filipe Manana +Fixes: 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()") +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ioctl.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 157 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -986,8 +986,155 @@ out: + return ret; + } + ++/* ++ * Defrag specific helper to get an extent map. ++ * ++ * Differences between this and btrfs_get_extent() are: ++ * ++ * - No extent_map will be added to inode->extent_tree ++ * To reduce memory usage in the long run. ++ * ++ * - Extra optimization to skip file extents older than @newer_than ++ * By using btrfs_search_forward() we can skip entire file ranges that ++ * have extents created in past transactions, because btrfs_search_forward() ++ * will not visit leaves and nodes with a generation smaller than given ++ * minimal generation threshold (@newer_than). ++ * ++ * Return valid em if we find a file extent matching the requirement. ++ * Return NULL if we can not find a file extent matching the requirement. ++ * ++ * Return ERR_PTR() for error. ++ */ ++static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, ++ u64 start, u64 newer_than) ++{ ++ struct btrfs_root *root = inode->root; ++ struct btrfs_file_extent_item *fi; ++ struct btrfs_path path = { 0 }; ++ struct extent_map *em; ++ struct btrfs_key key; ++ u64 ino = btrfs_ino(inode); ++ int ret; ++ ++ em = alloc_extent_map(); ++ if (!em) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ key.objectid = ino; ++ key.type = BTRFS_EXTENT_DATA_KEY; ++ key.offset = start; ++ ++ if (newer_than) { ++ ret = btrfs_search_forward(root, &key, &path, newer_than); ++ if (ret < 0) ++ goto err; ++ /* Can't find anything newer */ ++ if (ret > 0) ++ goto not_found; ++ } else { ++ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); ++ if (ret < 0) ++ goto err; ++ } ++ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { ++ /* ++ * If btrfs_search_slot() makes path to point beyond nritems, ++ * we should not have an empty leaf, as this inode must at ++ * least have its INODE_ITEM. ++ */ ++ ASSERT(btrfs_header_nritems(path.nodes[0])); ++ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; ++ } ++ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); ++ /* Perfect match, no need to go one slot back */ ++ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && ++ key.offset == start) ++ goto iterate; ++ ++ /* We didn't find a perfect match, needs to go one slot back */ ++ if (path.slots[0] > 0) { ++ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); ++ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) ++ path.slots[0]--; ++ } ++ ++iterate: ++ /* Iterate through the path to find a file extent covering @start */ ++ while (true) { ++ u64 extent_end; ++ ++ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) ++ goto next; ++ ++ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); ++ ++ /* ++ * We may go one slot back to INODE_REF/XATTR item, then ++ * need to go forward until we reach an EXTENT_DATA. ++ * But we should still has the correct ino as key.objectid. ++ */ ++ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) ++ goto next; ++ ++ /* It's beyond our target range, definitely not extent found */ ++ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) ++ goto not_found; ++ ++ /* ++ * | |<- File extent ->| ++ * \- start ++ * ++ * This means there is a hole between start and key.offset. ++ */ ++ if (key.offset > start) { ++ em->start = start; ++ em->orig_start = start; ++ em->block_start = EXTENT_MAP_HOLE; ++ em->len = key.offset - start; ++ break; ++ } ++ ++ fi = btrfs_item_ptr(path.nodes[0], path.slots[0], ++ struct btrfs_file_extent_item); ++ extent_end = btrfs_file_extent_end(&path); ++ ++ /* ++ * |<- file extent ->| | ++ * \- start ++ * ++ * We haven't reached start, search next slot. ++ */ ++ if (extent_end <= start) ++ goto next; ++ ++ /* Now this extent covers @start, convert it to em */ ++ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); ++ break; ++next: ++ ret = btrfs_next_item(root, &path); ++ if (ret < 0) ++ goto err; ++ if (ret > 0) ++ goto not_found; ++ } ++ btrfs_release_path(&path); ++ return em; ++ ++not_found: ++ btrfs_release_path(&path); ++ free_extent_map(em); ++ return NULL; ++ ++err: ++ btrfs_release_path(&path); ++ free_extent_map(em); ++ return ERR_PTR(ret); ++} ++ + static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, +- bool locked) ++ u64 newer_than, bool locked) + { + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; +@@ -1009,7 +1156,7 @@ static struct extent_map *defrag_lookup_ + /* get the big lock and read metadata off disk */ + if (!locked) + lock_extent_bits(io_tree, start, end, &cached); +- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); ++ em = defrag_get_extent(BTRFS_I(inode), start, newer_than); + if (!locked) + unlock_extent_cached(io_tree, start, end, &cached); + +@@ -1037,7 +1184,12 @@ static bool defrag_check_next_extent(str + if (em->start + em->len >= i_size_read(inode)) + return false; + +- next = defrag_lookup_extent(inode, em->start + em->len, locked); ++ /* ++ * We want to check if the next extent can be merged with the current ++ * one, which can be an extent created in a past generation, so we pass ++ * a minimum generation of 0 to defrag_lookup_extent(). ++ */ ++ next = defrag_lookup_extent(inode, em->start + em->len, 0, locked); + /* No more em or hole */ + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + goto out; +@@ -1188,7 +1340,8 @@ static int defrag_collect_targets(struct + u64 range_len; + + last_is_target = false; +- em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); ++ em = defrag_lookup_extent(&inode->vfs_inode, cur, ++ newer_than, locked); + if (!em) + break; + diff --git a/queue-5.16/btrfs-defrag-don-t-use-merged-extent-map-for-their-generation-check.patch b/queue-5.16/btrfs-defrag-don-t-use-merged-extent-map-for-their-generation-check.patch new file mode 100644 index 00000000000..3c7366bf624 --- /dev/null +++ b/queue-5.16/btrfs-defrag-don-t-use-merged-extent-map-for-their-generation-check.patch @@ -0,0 +1,110 @@ +From 199257a78bb01341c3ba6e85bdcf3a2e6e452c6d Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 11 Feb 2022 14:46:13 +0800 +Subject: btrfs: defrag: don't use merged extent map for their generation check + +From: Qu Wenruo + +commit 199257a78bb01341c3ba6e85bdcf3a2e6e452c6d upstream. + +For extent maps, if they are not compressed extents and are adjacent by +logical addresses and file offsets, they can be merged into one larger +extent map. + +Such merged extent map will have the higher generation of all the +original ones. + +But this brings a problem for autodefrag, as it relies on accurate +extent_map::generation to determine if one extent should be defragged. + +For merged extent maps, their higher generation can mark some older +extents to be defragged while the original extent map doesn't meet the +minimal generation threshold. + +Thus this will cause extra IO. + +So solve the problem, here we introduce a new flag, EXTENT_FLAG_MERGED, +to indicate if the extent map is merged from one or more ems. + +And for autodefrag, if we find a merged extent map, and its generation +meets the generation requirement, we just don't use this one, and go +back to defrag_get_extent() to read extent maps from subvolume trees. + +This could cause more read IO, but should result less defrag data write, +so in the long run it should be a win for autodefrag. + +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_map.c | 2 ++ + fs/btrfs/extent_map.h | 8 ++++++++ + fs/btrfs/ioctl.c | 14 ++++++++++++++ + 3 files changed, 24 insertions(+) + +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -261,6 +261,7 @@ static void try_merge_map(struct extent_ + em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; + em->mod_start = merge->mod_start; + em->generation = max(em->generation, merge->generation); ++ set_bit(EXTENT_FLAG_MERGED, &em->flags); + + rb_erase_cached(&merge->rb_node, &tree->map); + RB_CLEAR_NODE(&merge->rb_node); +@@ -278,6 +279,7 @@ static void try_merge_map(struct extent_ + RB_CLEAR_NODE(&merge->rb_node); + em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; + em->generation = max(em->generation, merge->generation); ++ set_bit(EXTENT_FLAG_MERGED, &em->flags); + free_extent_map(merge); + } + } +--- a/fs/btrfs/extent_map.h ++++ b/fs/btrfs/extent_map.h +@@ -25,6 +25,8 @@ enum { + EXTENT_FLAG_FILLING, + /* filesystem extent mapping type */ + EXTENT_FLAG_FS_MAPPING, ++ /* This em is merged from two or more physically adjacent ems */ ++ EXTENT_FLAG_MERGED, + }; + + struct extent_map { +@@ -40,6 +42,12 @@ struct extent_map { + u64 ram_bytes; + u64 block_start; + u64 block_len; ++ ++ /* ++ * Generation of the extent map, for merged em it's the highest ++ * generation of all merged ems. ++ * For non-merged extents, it's from btrfs_file_extent_item::generation. ++ */ + u64 generation; + unsigned long flags; + /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -1149,6 +1149,20 @@ static struct extent_map *defrag_lookup_ + em = lookup_extent_mapping(em_tree, start, sectorsize); + read_unlock(&em_tree->lock); + ++ /* ++ * We can get a merged extent, in that case, we need to re-search ++ * tree to get the original em for defrag. ++ * ++ * If @newer_than is 0 or em::generation < newer_than, we can trust ++ * this em, as either we don't care about the generation, or the ++ * merged extent map will be rejected anyway. ++ */ ++ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && ++ newer_than && em->generation >= newer_than) { ++ free_extent_map(em); ++ em = NULL; ++ } ++ + if (!em) { + struct extent_state *cached = NULL; + u64 end = start + sectorsize - 1; diff --git a/queue-5.16/series b/queue-5.16/series index c80383fe37a..811deb392ca 100644 --- a/queue-5.16/series +++ b/queue-5.16/series @@ -43,3 +43,5 @@ ata-pata_hpt37x-fix-pci-clock-detection.patch drm-amdgpu-check-vm-ready-by-amdgpu_vm-evicting-flag.patch tracing-add-ustring-operation-to-filtering-string-po.patch ipv6-fix-skb-drops-in-igmp6_event_query-and-igmp6_ev.patch +btrfs-defrag-bring-back-the-old-file-extent-search-behavior.patch +btrfs-defrag-don-t-use-merged-extent-map-for-their-generation-check.patch