From 8b146a9d8683debe4231b33a93930d886c903947 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 26 Sep 2022 08:54:33 +0200 Subject: [PATCH] 5.19-stable patches added patches: ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch ext4-make-directory-inode-spreading-reflect-flexbg-size.patch ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch ext4-use-locality-group-preallocation-for-small-closed-files.patch --- ...preading-of-allocations-among-groups.patch | 72 +++ ...-inode-spreading-reflect-flexbg-size.patch | 39 ++ ...or-cr-1-block-scan-instead-of-rbtree.patch | 527 ++++++++++++++++++ ...preallocation-for-small-closed-files.patch | 82 +++ queue-5.19/series | 4 + 5 files changed, 724 insertions(+) create mode 100644 queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch create mode 100644 queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch create mode 100644 queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch create mode 100644 queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch diff --git a/queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch b/queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch new file mode 100644 index 00000000000..41edfa8a082 --- /dev/null +++ b/queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch @@ -0,0 +1,72 @@ +From 1940265ede6683f6317cba0d428ce6505eaca944 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:25 +0200 +Subject: ext4: avoid unnecessary spreading of allocations among groups + +From: Jan Kara + +commit 1940265ede6683f6317cba0d428ce6505eaca944 upstream. + +mb_set_largest_free_order() updates lists containing groups with largest +chunk of free space of given order. The way it updates it leads to +always moving the group to the tail of the list. Thus allocations +looking for free space of given order effectively end up cycling through +all groups (and due to initialization in last to first order). This +spreads allocations among block groups which reduces performance for +rotating disks or low-end flash media. Change +mb_set_largest_free_order() to only update lists if the order of the +largest free chunk in the group changed. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-2-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1077,23 +1077,25 @@ mb_set_largest_free_order(struct super_b + struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; + +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { ++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) ++ if (grp->bb_counters[i] > 0) ++ break; ++ /* No need to move between order lists? 
*/ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || ++ i == grp->bb_largest_free_order) { ++ grp->bb_largest_free_order = i; ++ return; ++ } ++ ++ if (grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } +- grp->bb_largest_free_order = -1; /* uninit */ +- +- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { +- if (grp->bb_counters[i] > 0) { +- grp->bb_largest_free_order = i; +- break; +- } +- } +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && +- grp->bb_largest_free_order >= 0 && grp->bb_free) { ++ grp->bb_largest_free_order = i; ++ if (grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, diff --git a/queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch b/queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch new file mode 100644 index 00000000000..a82e75c65b2 --- /dev/null +++ b/queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch @@ -0,0 +1,39 @@ +From 613c5a85898d1cd44e68f28d65eccf64a8ace9cf Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:26 +0200 +Subject: ext4: make directory inode spreading reflect flexbg size + +From: Jan Kara + +commit 613c5a85898d1cd44e68f28d65eccf64a8ace9cf upstream. + +Currently the Orlov inode allocator searches for free inodes for a +directory only in flex block groups with at most inodes_per_group/16 +more directory inodes than average per flex block group. However with +growing size of flex block group this becomes unnecessarily strict. +Scale allowed difference from average directory count per flex block +group with flex block group size as we do with other metrics. + +Tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Cc: stable@kernel.org +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220908092136.11770-3-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ialloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -510,7 +510,7 @@ static int find_group_orlov(struct super + goto fallback; + } + +- max_dirs = ndirs / ngroups + inodes_per_group / 16; ++ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; diff --git a/queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch b/queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch new file mode 100644 index 00000000000..0cbc7264277 --- /dev/null +++ b/queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch @@ -0,0 +1,527 @@ +From 83e80a6e3543f37f74c8e48a5f305b054b65ce2a Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:28 +0200 +Subject: ext4: use buckets for cr 1 block scan instead of rbtree + +From: Jan Kara + +commit 83e80a6e3543f37f74c8e48a5f305b054b65ce2a upstream. 
+ +Using rbtree for sorting groups by average fragment size is relatively +expensive (needs rbtree update on every block freeing or allocation) and +leads to wide spreading of allocations because selection of block group +is very sentitive both to changes in free space and amount of blocks +allocated. Furthermore selecting group with the best matching average +fragment size is not necessary anyway, even more so because the +variability of fragment sizes within a group is likely large so average +is not telling much. We just need a group with large enough average +fragment size so that we have high probability of finding large enough +free extent and we don't want average fragment size to be too big so +that we are likely to find free extent only somewhat larger than what we +need. + +So instead of maintaing rbtree of groups sorted by fragment size keep +bins (lists) or groups where average fragment size is in the interval +[2^i, 2^(i+1)). This structure requires less updates on block allocation +/ freeing, generally avoids chaotic spreading of allocations into block +groups, and still is able to quickly (even faster that the rbtree) +provide a block group which is likely to have a suitably sized free +space extent. + +This patch reduces number of block groups used when untarring archive +with medium sized files (size somewhat above 64k which is default +mballoc limit for avoiding locality group preallocation) to about half +and thus improves write speeds for eMMC flash significantly. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Signed-off-by: Jan Kara +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-5-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 10 +- + fs/ext4/mballoc.c | 249 ++++++++++++++++++++++-------------------------------- + fs/ext4/mballoc.h | 1 + 3 files changed, 111 insertions(+), 149 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -167,8 +167,6 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_CR0_OPTIMIZED 0x8000 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 +-/* Perform linear traversal for one group */ +-#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 + struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; +@@ -1589,8 +1587,8 @@ struct ext4_sb_info { + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; +- struct rb_root s_mb_avg_fragment_size_root; +- rwlock_t s_mb_rb_lock; ++ struct list_head *s_mb_avg_fragment_size; ++ rwlock_t *s_mb_avg_fragment_size_locks; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; + +@@ -3402,6 +3400,8 @@ struct ext4_group_info { + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ int bb_avg_fragment_size_order; /* order of average ++ fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +@@ -3409,7 +3409,7 @@ struct ext4_group_info { + void *bb_bitmap; + #endif + struct rw_semaphore alloc_sem; +- struct rb_node 
bb_avg_fragment_size_rb; ++ struct list_head bb_avg_fragment_size_node; + struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -140,13 +140,15 @@ + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * +- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) ++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * +- * Locking: sbi->s_mb_rb_lock (rwlock) ++ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * +- * This is a red black tree consisting of group infos and the tree is sorted +- * by average fragment sizes (which is calculated as ext4_group_info->bb_free +- * / ext4_group_info->bb_fragments). ++ * This is an array of lists where in the i-th list there are groups with ++ * average fragment size >= 2^i and < 2^(i+1). The average fragment size ++ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. ++ * Note that we don't bother with a special list for completely empty groups ++ * so we only have MB_NUM_ORDERS(sb) lists. + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for +@@ -160,7 +162,8 @@ + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or +- * equal to request size using our rb tree (data structure 2) in O(log N) time. ++ * equal to request size using our average fragment size group lists (data ++ * structure 2) in O(1) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. +@@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(str + } + } + +-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, +- int (*cmp)(struct rb_node *, struct rb_node *)) ++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) + { +- struct rb_node **iter = &root->rb_node, *parent = NULL; ++ int order; + +- while (*iter) { +- parent = *iter; +- if (cmp(new, *iter) > 0) +- iter = &((*iter)->rb_left); +- else +- iter = &((*iter)->rb_right); +- } +- +- rb_link_node(new, parent, iter); +- rb_insert_color(new, root); +-} +- +-static int +-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +-{ +- struct ext4_group_info *grp1 = rb_entry(rb1, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- struct ext4_group_info *grp2 = rb_entry(rb2, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- int num_frags_1, num_frags_2; +- +- num_frags_1 = grp1->bb_fragments ? +- grp1->bb_free / grp1->bb_fragments : 0; +- num_frags_2 = grp2->bb_fragments ? +- grp2->bb_free / grp2->bb_fragments : 0; +- +- return (num_frags_2 - num_frags_1); ++ /* ++ * We don't bother with a special lists groups with only 1 block free ++ * extents and for completely empty groups. ++ */ ++ order = fls(len) - 2; ++ if (order < 0) ++ return 0; ++ if (order == MB_NUM_ORDERS(sb)) ++ order--; ++ return order; + } + +-/* +- * Reinsert grpinfo into the avg_fragment_size tree with new average +- * fragment size. 
+- */ ++/* Move group to appropriate avg_fragment_size list */ + static void + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ int new_order; + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + +- write_lock(&sbi->s_mb_rb_lock); +- if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { +- rb_erase(&grp->bb_avg_fragment_size_rb, +- &sbi->s_mb_avg_fragment_size_root); +- RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); +- } ++ new_order = mb_avg_fragment_size_order(sb, ++ grp->bb_free / grp->bb_fragments); ++ if (new_order == grp->bb_avg_fragment_size_order) ++ return; + +- ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, +- &grp->bb_avg_fragment_size_rb, +- ext4_mb_avg_fragment_size_cmp); +- write_unlock(&sbi->s_mb_rb_lock); ++ if (grp->bb_avg_fragment_size_order != -1) { ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_del(&grp->bb_avg_fragment_size_node); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ } ++ grp->bb_avg_fragment_size_order = new_order; ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_add_tail(&grp->bb_avg_fragment_size_node, ++ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); + } + + /* +@@ -909,86 +898,56 @@ static void ext4_mb_choose_next_group_cr + *new_cr = 1; + } else { + *group = grp->bb_group; +- ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } + } + + /* +- * Choose next group by traversing average fragment size tree. Updates *new_cr +- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that +- * the linear search should continue for one iteration since there's lock +- * contention on the rb tree lock. ++ * Choose next group by traversing average fragment size list of suitable ++ * order. Updates *new_cr if cr level needs an update. + */ + static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int avg_fragment_size, best_so_far; +- struct rb_node *node, *found; +- struct ext4_group_info *grp; +- +- /* +- * If there is contention on the lock, instead of waiting for the lock +- * to become available, just continue searching lineraly. We'll resume +- * our rb tree search later starting at ac->ac_last_optimal_group. 
+- */ +- if (!read_trylock(&sbi->s_mb_rb_lock)) { +- ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; +- return; +- } ++ struct ext4_group_info *grp, *iter; ++ int i; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); +- /* We have found something at CR 1 in the past */ +- grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); +- for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; +- found = rb_next(found)) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ } ++ ++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ++ i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) ++ continue; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ continue; ++ } ++ grp = NULL; ++ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], ++ bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); +- if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { ++ grp = iter; + break; +- } +- goto done; +- } +- +- node = sbi->s_mb_avg_fragment_size_root.rb_node; +- best_so_far = 0; +- found = NULL; +- +- while (node) { +- grp = rb_entry(node, struct ext4_group_info, +- bb_avg_fragment_size_rb); +- avg_fragment_size = 0; +- if (ext4_mb_good_group(ac, grp->bb_group, 1)) { +- avg_fragment_size = grp->bb_fragments ? +- grp->bb_free / grp->bb_fragments : 0; +- if (!best_so_far || avg_fragment_size < best_so_far) { +- best_so_far = avg_fragment_size; +- found = node; + } + } +- if (avg_fragment_size > ac->ac_g_ex.fe_len) +- node = node->rb_right; +- else +- node = node->rb_left; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (grp) ++ break; + } + +-done: +- if (found) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } +- +- read_unlock(&sbi->s_mb_rb_lock); +- ac->ac_last_optimal_group = *group; + } + + static inline int should_optimize_scan(struct ext4_allocation_context *ac) +@@ -1017,11 +976,6 @@ next_linear_group(struct ext4_allocation + goto inc_and_return; + } + +- if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { +- ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; +- goto inc_and_return; +- } +- + return group; + inc_and_return: + /* +@@ -1152,13 +1106,13 @@ void ext4_mb_generate_buddy(struct super + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + } + mb_set_largest_free_order(sb, grp); ++ mb_update_avg_fragment_size(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +- mb_update_avg_fragment_size(sb, grp); + } + + /* The buddy information is attached the buddy cache inode +@@ -2705,7 +2659,6 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; +- ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +@@ -2987,9 +2940,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) + struct super_block *sb = pde_data(file_inode(seq->file)); + unsigned long position; + +- read_lock(&EXT4_SB(sb)->s_mb_rb_lock); +- +- if (*pos < 0 || *pos >= 
MB_NUM_ORDERS(sb) + 1) ++ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +@@ -3001,7 +2952,7 @@ static void *ext4_mb_seq_structs_summary + unsigned long position; + + ++*pos; +- if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) ++ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +@@ -3013,29 +2964,22 @@ static int ext4_mb_seq_structs_summary_s + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; +- struct rb_node *n; +- unsigned int count, min, max; ++ unsigned int count; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { +- seq_puts(seq, "fragment_size_tree:\n"); +- n = rb_first(&sbi->s_mb_avg_fragment_size_root); +- if (!n) { +- seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); +- return 0; +- } +- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); +- min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; +- count = 1; +- while (rb_next(n)) { +- count++; +- n = rb_next(n); +- } +- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); +- max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; ++ position -= MB_NUM_ORDERS(sb); ++ if (position == 0) ++ seq_puts(seq, "avg_fragment_size_lists:\n"); + +- seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", +- min, max, count); ++ count = 0; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); ++ list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], ++ bb_avg_fragment_size_node) ++ count++; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); ++ seq_printf(seq, "\tlist_order_%u_groups: %u\n", ++ (unsigned int)position, count); + return 0; + } + +@@ -3045,9 +2989,11 @@ static int ext4_mb_seq_structs_summary_s + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; ++ read_lock(&sbi->s_mb_largest_free_orders_locks[position]); + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + +@@ -3055,11 +3001,7 @@ static int ext4_mb_seq_structs_summary_s + } + + static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +-__releases(&EXT4_SB(sb)->s_mb_rb_lock) + { +- struct super_block *sb = pde_data(file_inode(seq->file)); +- +- read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); + } + + const struct seq_operations ext4_mb_seq_structs_summary_ops = { +@@ -3172,8 +3114,9 @@ int ext4_mb_add_groupinfo(struct super_b + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); +- RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); ++ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ ++ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; + + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); +@@ -3422,7 +3365,24 @@ int ext4_mb_init(struct super_block *sb) + i++; + } while (i < MB_NUM_ORDERS(sb)); + +- sbi->s_mb_avg_fragment_size_root = RB_ROOT; ++ sbi->s_mb_avg_fragment_size = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if 
(!sbi->s_mb_avg_fragment_size) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_avg_fragment_size_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); ++ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); ++ } + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); +@@ -3441,7 +3401,6 @@ int ext4_mb_init(struct super_block *sb) + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } +- rwlock_init(&sbi->s_mb_rb_lock); + + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; +@@ -3512,6 +3471,8 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); +@@ -3578,6 +3539,8 @@ int ext4_mb_release(struct super_block * + kvfree(group_info); + rcu_read_unlock(); + } ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -178,7 +178,6 @@ struct ext4_allocation_context { + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + +- ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; diff --git a/queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch b/queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch new file mode 100644 index 00000000000..1f0064fe5fc --- /dev/null +++ b/queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch @@ -0,0 +1,82 @@ +From a9f2a2931d0e197ab28c6007966053fdababd53f Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:27 +0200 +Subject: ext4: use locality group preallocation for small closed files + +From: Jan Kara + +commit a9f2a2931d0e197ab28c6007966053fdababd53f upstream. + +Curently we don't use any preallocation when a file is already closed +when allocating blocks (from writeback code when converting delayed +allocation). However for small files, using locality group preallocation +is actually desirable as that is not specific to a particular file. +Rather it is a method to pack small files together to reduce +fragmentation and for that the fact the file is closed is actually even +stronger hint the file would benefit from packing. So change the logic +to allow locality group preallocation in this case. 
+ +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-4-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -5189,6 +5189,7 @@ static void ext4_mb_group_or_file(struct + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; ++ bool inode_pa_eligible, group_pa_eligible; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; +@@ -5196,25 +5197,27 @@ static void ext4_mb_group_or_file(struct + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ group_pa_eligible = sbi->s_mb_group_prealloc > 0; ++ inode_pa_eligible = true; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + ++ /* No point in using inode preallocation for closed files */ + if ((size == isize) && !ext4_fs_is_busy(sbi) && +- !inode_is_open_for_write(ac->ac_inode)) { +- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; +- return; +- } +- +- if (sbi->s_mb_group_prealloc <= 0) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; +- return; +- } ++ !inode_is_open_for_write(ac->ac_inode)) ++ inode_pa_eligible = false; + +- /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ /* Don't use group allocation for large files */ ++ if (size > sbi->s_mb_stream_request) ++ group_pa_eligible = false; ++ ++ if (!group_pa_eligible) { ++ if (inode_pa_eligible) ++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ else ++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + diff --git a/queue-5.19/series b/queue-5.19/series index 1ad4f6e0af8..3d4d98115c5 100644 --- a/queue-5.19/series +++ b/queue-5.19/series @@ -200,3 +200,7 @@ devdax-fix-soft-reservation-memory-description.patch ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch +ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch +ext4-make-directory-inode-spreading-reflect-flexbg-size.patch +ext4-use-locality-group-preallocation-for-small-closed-files.patch +ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch -- 2.47.3
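
For reviewers who want to sanity-check the bucketing scheme described in the "use buckets for cr 1 block scan" patch above, here is a small standalone sketch of the list-index rule that the patch's mb_avg_fragment_size_order() helper implements: an average fragment size len is mapped to one of MB_NUM_ORDERS(sb) per-order lists via fls(len) - 2, with one-block averages folded into the first list and the largest possible averages folded into the last. This is illustrative userspace code, not part of the queued patches; fls() below is a stand-in for the kernel helper of the same name, num_orders stands in for MB_NUM_ORDERS(sb), and 14 is only an example value.

#include <stdio.h>

/*
 * Userspace stand-in for the kernel's fls(): position of the most
 * significant set bit, counting from 1; returns 0 for x == 0.
 */
static int fls(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

/*
 * Mirrors the list-index rule of mb_avg_fragment_size_order() in the
 * patch above: list i holds groups whose average free-extent size len
 * satisfies fls(len) - 2 == i, with 1-block averages folded into list 0
 * and the largest averages folded into the last list, so only
 * num_orders lists are needed.
 */
static int avg_fragment_size_order(int num_orders, unsigned int len)
{
	int order = fls(len) - 2;

	if (order < 0)
		return 0;
	if (order == num_orders)
		order--;
	return order;
}

int main(void)
{
	/* Example list count only; the kernel derives it from the block size. */
	const int num_orders = 14;
	const unsigned int sizes[] = { 1, 2, 3, 8, 100, 4096, 32768 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("avg fragment size %5u blocks -> list %d\n",
		       sizes[i], avg_fragment_size_order(num_orders, sizes[i]));
	return 0;
}

Compiling and running this prints, for a handful of example averages, which list a block group would be placed on; that direct index computation is the O(1) lookup the commit message contrasts with the old rbtree search.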