--- /dev/null
+From d5633b0dee02d7d25e93463a03709f11c71500e2 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 11 Feb 2022 14:46:12 +0800
+Subject: btrfs: defrag: bring back the old file extent search behavior
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit d5633b0dee02d7d25e93463a03709f11c71500e2 upstream.
+
+For defrag, we don't really want to use btrfs_get_extent() to iterate
+all extent maps of an inode.
+
+The reasons are:
+
+- btrfs_get_extent() can merge extent maps
+ And the result em has the higher generation of the two, causing defrag
+ to mark unnecessary part of such merged large extent map.
+
+ This in fact can result extra IO for autodefrag in v5.16+ kernels.
+
+ However this patch is not going to completely solve the problem, as
+ one can still using read() to trigger extent map reading, and got
+ them merged.
+
+ The completely solution for the extent map merging generation problem
+ will come as an standalone fix.
+
+- btrfs_get_extent() caches the extent map result
+ Normally it's fine, but for defrag the target range may not get
+ another read/write for a long long time.
+ Such cache would only increase the memory usage.
+
+- btrfs_get_extent() doesn't skip older extent map
+ Unlike the old find_new_extent() which uses btrfs_search_forward() to
+ skip the older subtree, thus it will pick up unnecessary extent maps.
+
+This patch will fix the regression by introducing defrag_get_extent() to
+replace the btrfs_get_extent() call.
+
+This helper will:
+
+- Not cache the file extent we found
+ It will search the file extent and manually convert it to em.
+
+- Use btrfs_search_forward() to skip entire ranges which is modified in
+ the past
+
+This should reduce the IO for autodefrag.
+
+Reported-by: Filipe Manana <fdmanana@suse.com>
+Fixes: 7b508037d4ca ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()")
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ioctl.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
+ 1 file changed, 157 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -986,8 +986,155 @@ out:
+ return ret;
+ }
+
++/*
++ * Defrag specific helper to get an extent map.
++ *
++ * Differences between this and btrfs_get_extent() are:
++ *
++ * - No extent_map will be added to inode->extent_tree
++ * To reduce memory usage in the long run.
++ *
++ * - Extra optimization to skip file extents older than @newer_than
++ * By using btrfs_search_forward() we can skip entire file ranges that
++ * have extents created in past transactions, because btrfs_search_forward()
++ * will not visit leaves and nodes with a generation smaller than given
++ * minimal generation threshold (@newer_than).
++ *
++ * Return valid em if we find a file extent matching the requirement.
++ * Return NULL if we can not find a file extent matching the requirement.
++ *
++ * Return ERR_PTR() for error.
++ */
++static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
++ u64 start, u64 newer_than)
++{
++ struct btrfs_root *root = inode->root;
++ struct btrfs_file_extent_item *fi;
++ struct btrfs_path path = { 0 };
++ struct extent_map *em;
++ struct btrfs_key key;
++ u64 ino = btrfs_ino(inode);
++ int ret;
++
++ em = alloc_extent_map();
++ if (!em) {
++ ret = -ENOMEM;
++ goto err;
++ }
++
++ key.objectid = ino;
++ key.type = BTRFS_EXTENT_DATA_KEY;
++ key.offset = start;
++
++ if (newer_than) {
++ ret = btrfs_search_forward(root, &key, &path, newer_than);
++ if (ret < 0)
++ goto err;
++ /* Can't find anything newer */
++ if (ret > 0)
++ goto not_found;
++ } else {
++ ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
++ if (ret < 0)
++ goto err;
++ }
++ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
++ /*
++ * If btrfs_search_slot() makes path to point beyond nritems,
++ * we should not have an empty leaf, as this inode must at
++ * least have its INODE_ITEM.
++ */
++ ASSERT(btrfs_header_nritems(path.nodes[0]));
++ path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
++ }
++ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
++ /* Perfect match, no need to go one slot back */
++ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
++ key.offset == start)
++ goto iterate;
++
++ /* We didn't find a perfect match, needs to go one slot back */
++ if (path.slots[0] > 0) {
++ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
++ if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
++ path.slots[0]--;
++ }
++
++iterate:
++ /* Iterate through the path to find a file extent covering @start */
++ while (true) {
++ u64 extent_end;
++
++ if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
++ goto next;
++
++ btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
++
++ /*
++ * We may go one slot back to INODE_REF/XATTR item, then
++ * need to go forward until we reach an EXTENT_DATA.
++ * But we should still has the correct ino as key.objectid.
++ */
++ if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
++ goto next;
++
++ /* It's beyond our target range, definitely not extent found */
++ if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
++ goto not_found;
++
++ /*
++ * | |<- File extent ->|
++ * \- start
++ *
++ * This means there is a hole between start and key.offset.
++ */
++ if (key.offset > start) {
++ em->start = start;
++ em->orig_start = start;
++ em->block_start = EXTENT_MAP_HOLE;
++ em->len = key.offset - start;
++ break;
++ }
++
++ fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
++ struct btrfs_file_extent_item);
++ extent_end = btrfs_file_extent_end(&path);
++
++ /*
++ * |<- file extent ->| |
++ * \- start
++ *
++ * We haven't reached start, search next slot.
++ */
++ if (extent_end <= start)
++ goto next;
++
++ /* Now this extent covers @start, convert it to em */
++ btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
++ break;
++next:
++ ret = btrfs_next_item(root, &path);
++ if (ret < 0)
++ goto err;
++ if (ret > 0)
++ goto not_found;
++ }
++ btrfs_release_path(&path);
++ return em;
++
++not_found:
++ btrfs_release_path(&path);
++ free_extent_map(em);
++ return NULL;
++
++err:
++ btrfs_release_path(&path);
++ free_extent_map(em);
++ return ERR_PTR(ret);
++}
++
+ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+- bool locked)
++ u64 newer_than, bool locked)
+ {
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+@@ -1009,7 +1156,7 @@ static struct extent_map *defrag_lookup_
+ /* get the big lock and read metadata off disk */
+ if (!locked)
+ lock_extent_bits(io_tree, start, end, &cached);
+- em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
++ em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+ if (!locked)
+ unlock_extent_cached(io_tree, start, end, &cached);
+
+@@ -1037,7 +1184,12 @@ static bool defrag_check_next_extent(str
+ if (em->start + em->len >= i_size_read(inode))
+ return false;
+
+- next = defrag_lookup_extent(inode, em->start + em->len, locked);
++ /*
++ * We want to check if the next extent can be merged with the current
++ * one, which can be an extent created in a past generation, so we pass
++ * a minimum generation of 0 to defrag_lookup_extent().
++ */
++ next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
+ /* No more em or hole */
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ goto out;
+@@ -1188,7 +1340,8 @@ static int defrag_collect_targets(struct
+ u64 range_len;
+
+ last_is_target = false;
+- em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
++ em = defrag_lookup_extent(&inode->vfs_inode, cur,
++ newer_than, locked);
+ if (!em)
+ break;
+
--- /dev/null
+From 199257a78bb01341c3ba6e85bdcf3a2e6e452c6d Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 11 Feb 2022 14:46:13 +0800
+Subject: btrfs: defrag: don't use merged extent map for their generation check
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 199257a78bb01341c3ba6e85bdcf3a2e6e452c6d upstream.
+
+For extent maps, if they are not compressed extents and are adjacent by
+logical addresses and file offsets, they can be merged into one larger
+extent map.
+
+Such merged extent map will have the higher generation of all the
+original ones.
+
+But this brings a problem for autodefrag, as it relies on accurate
+extent_map::generation to determine if one extent should be defragged.
+
+For merged extent maps, their higher generation can mark some older
+extents to be defragged while the original extent map doesn't meet the
+minimal generation threshold.
+
+Thus this will cause extra IO.
+
+So solve the problem, here we introduce a new flag, EXTENT_FLAG_MERGED,
+to indicate if the extent map is merged from one or more ems.
+
+And for autodefrag, if we find a merged extent map, and its generation
+meets the generation requirement, we just don't use this one, and go
+back to defrag_get_extent() to read extent maps from subvolume trees.
+
+This could cause more read IO, but should result less defrag data write,
+so in the long run it should be a win for autodefrag.
+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent_map.c | 2 ++
+ fs/btrfs/extent_map.h | 8 ++++++++
+ fs/btrfs/ioctl.c | 14 ++++++++++++++
+ 3 files changed, 24 insertions(+)
+
+--- a/fs/btrfs/extent_map.c
++++ b/fs/btrfs/extent_map.c
+@@ -261,6 +261,7 @@ static void try_merge_map(struct extent_
+ em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+ em->mod_start = merge->mod_start;
+ em->generation = max(em->generation, merge->generation);
++ set_bit(EXTENT_FLAG_MERGED, &em->flags);
+
+ rb_erase_cached(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
+@@ -278,6 +279,7 @@ static void try_merge_map(struct extent_
+ RB_CLEAR_NODE(&merge->rb_node);
+ em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+ em->generation = max(em->generation, merge->generation);
++ set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ free_extent_map(merge);
+ }
+ }
+--- a/fs/btrfs/extent_map.h
++++ b/fs/btrfs/extent_map.h
+@@ -25,6 +25,8 @@ enum {
+ EXTENT_FLAG_FILLING,
+ /* filesystem extent mapping type */
+ EXTENT_FLAG_FS_MAPPING,
++ /* This em is merged from two or more physically adjacent ems */
++ EXTENT_FLAG_MERGED,
+ };
+
+ struct extent_map {
+@@ -40,6 +42,12 @@ struct extent_map {
+ u64 ram_bytes;
+ u64 block_start;
+ u64 block_len;
++
++ /*
++ * Generation of the extent map, for merged em it's the highest
++ * generation of all merged ems.
++ * For non-merged extents, it's from btrfs_file_extent_item::generation.
++ */
+ u64 generation;
+ unsigned long flags;
+ /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -1149,6 +1149,20 @@ static struct extent_map *defrag_lookup_
+ em = lookup_extent_mapping(em_tree, start, sectorsize);
+ read_unlock(&em_tree->lock);
+
++ /*
++ * We can get a merged extent, in that case, we need to re-search
++ * tree to get the original em for defrag.
++ *
++ * If @newer_than is 0 or em::generation < newer_than, we can trust
++ * this em, as either we don't care about the generation, or the
++ * merged extent map will be rejected anyway.
++ */
++ if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
++ newer_than && em->generation >= newer_than) {
++ free_extent_map(em);
++ em = NULL;
++ }
++
+ if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + sectorsize - 1;