From 8b146a9d8683debe4231b33a93930d886c903947 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 26 Sep 2022 08:54:33 +0200 Subject: [PATCH] 5.19-stable patches added patches: ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch ext4-make-directory-inode-spreading-reflect-flexbg-size.patch ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch ext4-use-locality-group-preallocation-for-small-closed-files.patch --- ...preading-of-allocations-among-groups.patch | 72 +++ ...-inode-spreading-reflect-flexbg-size.patch | 39 ++ ...or-cr-1-block-scan-instead-of-rbtree.patch | 527 ++++++++++++++++++ ...preallocation-for-small-closed-files.patch | 82 +++ queue-5.19/series | 4 + 5 files changed, 724 insertions(+) create mode 100644 queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch create mode 100644 queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch create mode 100644 queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch create mode 100644 queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch diff --git a/queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch b/queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch new file mode 100644 index 00000000000..41edfa8a082 --- /dev/null +++ b/queue-5.19/ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch @@ -0,0 +1,72 @@ +From 1940265ede6683f6317cba0d428ce6505eaca944 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:25 +0200 +Subject: ext4: avoid unnecessary spreading of allocations among groups + +From: Jan Kara + +commit 1940265ede6683f6317cba0d428ce6505eaca944 upstream. + +mb_set_largest_free_order() updates lists containing groups with largest +chunk of free space of given order. The way it updates it leads to +always moving the group to the tail of the list. Thus allocations +looking for free space of given order effectively end up cycling through +all groups (and due to initialization in last to first order). This +spreads allocations among block groups which reduces performance for +rotating disks or low-end flash media. Change +mb_set_largest_free_order() to only update lists if the order of the +largest free chunk in the group changed. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-2-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 24 +++++++++++++----------- + 1 file changed, 13 insertions(+), 11 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1077,23 +1077,25 @@ mb_set_largest_free_order(struct super_b + struct ext4_sb_info *sbi = EXT4_SB(sb); + int i; + +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { ++ for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) ++ if (grp->bb_counters[i] > 0) ++ break; ++ /* No need to move between order lists? 
*/ ++ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || ++ i == grp->bb_largest_free_order) { ++ grp->bb_largest_free_order = i; ++ return; ++ } ++ ++ if (grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } +- grp->bb_largest_free_order = -1; /* uninit */ +- +- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { +- if (grp->bb_counters[i] > 0) { +- grp->bb_largest_free_order = i; +- break; +- } +- } +- if (test_opt2(sb, MB_OPTIMIZE_SCAN) && +- grp->bb_largest_free_order >= 0 && grp->bb_free) { ++ grp->bb_largest_free_order = i; ++ if (grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, diff --git a/queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch b/queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch new file mode 100644 index 00000000000..a82e75c65b2 --- /dev/null +++ b/queue-5.19/ext4-make-directory-inode-spreading-reflect-flexbg-size.patch @@ -0,0 +1,39 @@ +From 613c5a85898d1cd44e68f28d65eccf64a8ace9cf Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:26 +0200 +Subject: ext4: make directory inode spreading reflect flexbg size + +From: Jan Kara + +commit 613c5a85898d1cd44e68f28d65eccf64a8ace9cf upstream. + +Currently the Orlov inode allocator searches for free inodes for a +directory only in flex block groups with at most inodes_per_group/16 +more directory inodes than average per flex block group. However with +growing size of flex block group this becomes unnecessarily strict. +Scale allowed difference from average directory count per flex block +group with flex block group size as we do with other metrics. + +Tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Cc: stable@kernel.org +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/r/20220908092136.11770-3-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ialloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -510,7 +510,7 @@ static int find_group_orlov(struct super + goto fallback; + } + +- max_dirs = ndirs / ngroups + inodes_per_group / 16; ++ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16; + min_inodes = avefreei - inodes_per_group*flex_size / 4; + if (min_inodes < 1) + min_inodes = 1; diff --git a/queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch b/queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch new file mode 100644 index 00000000000..0cbc7264277 --- /dev/null +++ b/queue-5.19/ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch @@ -0,0 +1,527 @@ +From 83e80a6e3543f37f74c8e48a5f305b054b65ce2a Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:28 +0200 +Subject: ext4: use buckets for cr 1 block scan instead of rbtree + +From: Jan Kara + +commit 83e80a6e3543f37f74c8e48a5f305b054b65ce2a upstream. 
+ +Using rbtree for sorting groups by average fragment size is relatively +expensive (needs rbtree update on every block freeing or allocation) and +leads to wide spreading of allocations because selection of block group +is very sentitive both to changes in free space and amount of blocks +allocated. Furthermore selecting group with the best matching average +fragment size is not necessary anyway, even more so because the +variability of fragment sizes within a group is likely large so average +is not telling much. We just need a group with large enough average +fragment size so that we have high probability of finding large enough +free extent and we don't want average fragment size to be too big so +that we are likely to find free extent only somewhat larger than what we +need. + +So instead of maintaing rbtree of groups sorted by fragment size keep +bins (lists) or groups where average fragment size is in the interval +[2^i, 2^(i+1)). This structure requires less updates on block allocation +/ freeing, generally avoids chaotic spreading of allocations into block +groups, and still is able to quickly (even faster that the rbtree) +provide a block group which is likely to have a suitably sized free +space extent. + +This patch reduces number of block groups used when untarring archive +with medium sized files (size somewhat above 64k which is default +mballoc limit for avoiding locality group preallocation) to about half +and thus improves write speeds for eMMC flash significantly. + +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Signed-off-by: Jan Kara +Reviewed-by: Ritesh Harjani (IBM) +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-5-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 10 +- + fs/ext4/mballoc.c | 249 ++++++++++++++++++++++-------------------------------- + fs/ext4/mballoc.h | 1 + 3 files changed, 111 insertions(+), 149 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -167,8 +167,6 @@ enum SHIFT_DIRECTION { + #define EXT4_MB_CR0_OPTIMIZED 0x8000 + /* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ + #define EXT4_MB_CR1_OPTIMIZED 0x00010000 +-/* Perform linear traversal for one group */ +-#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 + struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; +@@ -1589,8 +1587,8 @@ struct ext4_sb_info { + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; +- struct rb_root s_mb_avg_fragment_size_root; +- rwlock_t s_mb_rb_lock; ++ struct list_head *s_mb_avg_fragment_size; ++ rwlock_t *s_mb_avg_fragment_size_locks; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; + +@@ -3402,6 +3400,8 @@ struct ext4_group_info { + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ int bb_avg_fragment_size_order; /* order of average ++ fragment in BG */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; +@@ -3409,7 +3409,7 @@ struct ext4_group_info { + void *bb_bitmap; + #endif + struct rw_semaphore alloc_sem; +- struct rb_node 
bb_avg_fragment_size_rb; ++ struct list_head bb_avg_fragment_size_node; + struct list_head bb_largest_free_order_node; + ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block + * regions, index is order. +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -140,13 +140,15 @@ + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * +- * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) ++ * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) + * +- * Locking: sbi->s_mb_rb_lock (rwlock) ++ * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) + * +- * This is a red black tree consisting of group infos and the tree is sorted +- * by average fragment sizes (which is calculated as ext4_group_info->bb_free +- * / ext4_group_info->bb_fragments). ++ * This is an array of lists where in the i-th list there are groups with ++ * average fragment size >= 2^i and < 2^(i+1). The average fragment size ++ * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. ++ * Note that we don't bother with a special list for completely empty groups ++ * so we only have MB_NUM_ORDERS(sb) lists. + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for +@@ -160,7 +162,8 @@ + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or +- * equal to request size using our rb tree (data structure 2) in O(log N) time. ++ * equal to request size using our average fragment size group lists (data ++ * structure 2) in O(1) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. +@@ -802,65 +805,51 @@ static void ext4_mb_mark_free_simple(str + } + } + +-static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, +- int (*cmp)(struct rb_node *, struct rb_node *)) ++static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) + { +- struct rb_node **iter = &root->rb_node, *parent = NULL; ++ int order; + +- while (*iter) { +- parent = *iter; +- if (cmp(new, *iter) > 0) +- iter = &((*iter)->rb_left); +- else +- iter = &((*iter)->rb_right); +- } +- +- rb_link_node(new, parent, iter); +- rb_insert_color(new, root); +-} +- +-static int +-ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +-{ +- struct ext4_group_info *grp1 = rb_entry(rb1, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- struct ext4_group_info *grp2 = rb_entry(rb2, +- struct ext4_group_info, +- bb_avg_fragment_size_rb); +- int num_frags_1, num_frags_2; +- +- num_frags_1 = grp1->bb_fragments ? +- grp1->bb_free / grp1->bb_fragments : 0; +- num_frags_2 = grp2->bb_fragments ? +- grp2->bb_free / grp2->bb_fragments : 0; +- +- return (num_frags_2 - num_frags_1); ++ /* ++ * We don't bother with a special lists groups with only 1 block free ++ * extents and for completely empty groups. ++ */ ++ order = fls(len) - 2; ++ if (order < 0) ++ return 0; ++ if (order == MB_NUM_ORDERS(sb)) ++ order--; ++ return order; + } + +-/* +- * Reinsert grpinfo into the avg_fragment_size tree with new average +- * fragment size. 
+- */ ++/* Move group to appropriate avg_fragment_size list */ + static void + mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ int new_order; + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + +- write_lock(&sbi->s_mb_rb_lock); +- if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { +- rb_erase(&grp->bb_avg_fragment_size_rb, +- &sbi->s_mb_avg_fragment_size_root); +- RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); +- } ++ new_order = mb_avg_fragment_size_order(sb, ++ grp->bb_free / grp->bb_fragments); ++ if (new_order == grp->bb_avg_fragment_size_order) ++ return; + +- ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, +- &grp->bb_avg_fragment_size_rb, +- ext4_mb_avg_fragment_size_cmp); +- write_unlock(&sbi->s_mb_rb_lock); ++ if (grp->bb_avg_fragment_size_order != -1) { ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_del(&grp->bb_avg_fragment_size_node); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ } ++ grp->bb_avg_fragment_size_order = new_order; ++ write_lock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); ++ list_add_tail(&grp->bb_avg_fragment_size_node, ++ &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); ++ write_unlock(&sbi->s_mb_avg_fragment_size_locks[ ++ grp->bb_avg_fragment_size_order]); + } + + /* +@@ -909,86 +898,56 @@ static void ext4_mb_choose_next_group_cr + *new_cr = 1; + } else { + *group = grp->bb_group; +- ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } + } + + /* +- * Choose next group by traversing average fragment size tree. Updates *new_cr +- * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that +- * the linear search should continue for one iteration since there's lock +- * contention on the rb tree lock. ++ * Choose next group by traversing average fragment size list of suitable ++ * order. Updates *new_cr if cr level needs an update. + */ + static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); +- int avg_fragment_size, best_so_far; +- struct rb_node *node, *found; +- struct ext4_group_info *grp; +- +- /* +- * If there is contention on the lock, instead of waiting for the lock +- * to become available, just continue searching lineraly. We'll resume +- * our rb tree search later starting at ac->ac_last_optimal_group. 
+- */ +- if (!read_trylock(&sbi->s_mb_rb_lock)) { +- ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; +- return; +- } ++ struct ext4_group_info *grp, *iter; ++ int i; + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); +- /* We have found something at CR 1 in the past */ +- grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); +- for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; +- found = rb_next(found)) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ } ++ ++ for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); ++ i < MB_NUM_ORDERS(ac->ac_sb); i++) { ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) ++ continue; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (list_empty(&sbi->s_mb_avg_fragment_size[i])) { ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ continue; ++ } ++ grp = NULL; ++ list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i], ++ bb_avg_fragment_size_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); +- if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) ++ if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) { ++ grp = iter; + break; +- } +- goto done; +- } +- +- node = sbi->s_mb_avg_fragment_size_root.rb_node; +- best_so_far = 0; +- found = NULL; +- +- while (node) { +- grp = rb_entry(node, struct ext4_group_info, +- bb_avg_fragment_size_rb); +- avg_fragment_size = 0; +- if (ext4_mb_good_group(ac, grp->bb_group, 1)) { +- avg_fragment_size = grp->bb_fragments ? +- grp->bb_free / grp->bb_fragments : 0; +- if (!best_so_far || avg_fragment_size < best_so_far) { +- best_so_far = avg_fragment_size; +- found = node; + } + } +- if (avg_fragment_size > ac->ac_g_ex.fe_len) +- node = node->rb_right; +- else +- node = node->rb_left; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]); ++ if (grp) ++ break; + } + +-done: +- if (found) { +- grp = rb_entry(found, struct ext4_group_info, +- bb_avg_fragment_size_rb); ++ if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } +- +- read_unlock(&sbi->s_mb_rb_lock); +- ac->ac_last_optimal_group = *group; + } + + static inline int should_optimize_scan(struct ext4_allocation_context *ac) +@@ -1017,11 +976,6 @@ next_linear_group(struct ext4_allocation + goto inc_and_return; + } + +- if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { +- ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; +- goto inc_and_return; +- } +- + return group; + inc_and_return: + /* +@@ -1152,13 +1106,13 @@ void ext4_mb_generate_buddy(struct super + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + } + mb_set_largest_free_order(sb, grp); ++ mb_update_avg_fragment_size(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); +- mb_update_avg_fragment_size(sb, grp); + } + + /* The buddy information is attached the buddy cache inode +@@ -2705,7 +2659,6 @@ repeat: + * from the goal value specified + */ + group = ac->ac_g_ex.fe_group; +- ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; + prefetch_grp = group; + +@@ -2987,9 +2940,7 @@ __acquires(&EXT4_SB(sb)->s_mb_rb_lock) + struct super_block *sb = pde_data(file_inode(seq->file)); + unsigned long position; + +- read_lock(&EXT4_SB(sb)->s_mb_rb_lock); +- +- if (*pos < 0 || *pos >= 
MB_NUM_ORDERS(sb) + 1) ++ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +@@ -3001,7 +2952,7 @@ static void *ext4_mb_seq_structs_summary + unsigned long position; + + ++*pos; +- if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) ++ if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +@@ -3013,29 +2964,22 @@ static int ext4_mb_seq_structs_summary_s + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; +- struct rb_node *n; +- unsigned int count, min, max; ++ unsigned int count; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { +- seq_puts(seq, "fragment_size_tree:\n"); +- n = rb_first(&sbi->s_mb_avg_fragment_size_root); +- if (!n) { +- seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); +- return 0; +- } +- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); +- min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; +- count = 1; +- while (rb_next(n)) { +- count++; +- n = rb_next(n); +- } +- grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); +- max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; ++ position -= MB_NUM_ORDERS(sb); ++ if (position == 0) ++ seq_puts(seq, "avg_fragment_size_lists:\n"); + +- seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", +- min, max, count); ++ count = 0; ++ read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); ++ list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], ++ bb_avg_fragment_size_node) ++ count++; ++ read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); ++ seq_printf(seq, "\tlist_order_%u_groups: %u\n", ++ (unsigned int)position, count); + return 0; + } + +@@ -3045,9 +2989,11 @@ static int ext4_mb_seq_structs_summary_s + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; ++ read_lock(&sbi->s_mb_largest_free_orders_locks[position]); + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; ++ read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + +@@ -3055,11 +3001,7 @@ static int ext4_mb_seq_structs_summary_s + } + + static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +-__releases(&EXT4_SB(sb)->s_mb_rb_lock) + { +- struct super_block *sb = pde_data(file_inode(seq->file)); +- +- read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); + } + + const struct seq_operations ext4_mb_seq_structs_summary_ops = { +@@ -3172,8 +3114,9 @@ int ext4_mb_add_groupinfo(struct super_b + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); +- RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); ++ INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); + meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ ++ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; + + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); +@@ -3422,7 +3365,24 @@ int ext4_mb_init(struct super_block *sb) + i++; + } while (i < MB_NUM_ORDERS(sb)); + +- sbi->s_mb_avg_fragment_size_root = RB_ROOT; ++ sbi->s_mb_avg_fragment_size = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), ++ GFP_KERNEL); ++ if 
(!sbi->s_mb_avg_fragment_size) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ sbi->s_mb_avg_fragment_size_locks = ++ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), ++ GFP_KERNEL); ++ if (!sbi->s_mb_avg_fragment_size_locks) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (i = 0; i < MB_NUM_ORDERS(sb); i++) { ++ INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); ++ rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); ++ } + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); +@@ -3441,7 +3401,6 @@ int ext4_mb_init(struct super_block *sb) + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } +- rwlock_init(&sbi->s_mb_rb_lock); + + spin_lock_init(&sbi->s_md_lock); + sbi->s_mb_free_pending = 0; +@@ -3512,6 +3471,8 @@ out_free_locality_groups: + free_percpu(sbi->s_locality_groups); + sbi->s_locality_groups = NULL; + out: ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); +@@ -3578,6 +3539,8 @@ int ext4_mb_release(struct super_block * + kvfree(group_info); + rcu_read_unlock(); + } ++ kfree(sbi->s_mb_avg_fragment_size); ++ kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); + kfree(sbi->s_mb_offsets); +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -178,7 +178,6 @@ struct ext4_allocation_context { + /* copy of the best found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + +- ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ + __u16 ac_groups_scanned; diff --git a/queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch b/queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch new file mode 100644 index 00000000000..1f0064fe5fc --- /dev/null +++ b/queue-5.19/ext4-use-locality-group-preallocation-for-small-closed-files.patch @@ -0,0 +1,82 @@ +From a9f2a2931d0e197ab28c6007966053fdababd53f Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 8 Sep 2022 11:21:27 +0200 +Subject: ext4: use locality group preallocation for small closed files + +From: Jan Kara + +commit a9f2a2931d0e197ab28c6007966053fdababd53f upstream. + +Curently we don't use any preallocation when a file is already closed +when allocating blocks (from writeback code when converting delayed +allocation). However for small files, using locality group preallocation +is actually desirable as that is not specific to a particular file. +Rather it is a method to pack small files together to reduce +fragmentation and for that the fact the file is closed is actually even +stronger hint the file would benefit from packing. So change the logic +to allow locality group preallocation in this case. 
+ +Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") +CC: stable@kernel.org +Reported-and-tested-by: Stefan Wahren +Tested-by: Ojaswin Mujoo +Reviewed-by: Ritesh Harjani (IBM) +Signed-off-by: Jan Kara +Link: https://lore.kernel.org/all/0d81a7c2-46b7-6010-62a4-3e6cfc1628d6@i2se.com/ +Link: https://lore.kernel.org/r/20220908092136.11770-4-jack@suse.cz +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 27 +++++++++++++++------------ + 1 file changed, 15 insertions(+), 12 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -5189,6 +5189,7 @@ static void ext4_mb_group_or_file(struct + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int bsbits = ac->ac_sb->s_blocksize_bits; + loff_t size, isize; ++ bool inode_pa_eligible, group_pa_eligible; + + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; +@@ -5196,25 +5197,27 @@ static void ext4_mb_group_or_file(struct + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + return; + ++ group_pa_eligible = sbi->s_mb_group_prealloc > 0; ++ inode_pa_eligible = true; + size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; + ++ /* No point in using inode preallocation for closed files */ + if ((size == isize) && !ext4_fs_is_busy(sbi) && +- !inode_is_open_for_write(ac->ac_inode)) { +- ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; +- return; +- } +- +- if (sbi->s_mb_group_prealloc <= 0) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; +- return; +- } ++ !inode_is_open_for_write(ac->ac_inode)) ++ inode_pa_eligible = false; + +- /* don't use group allocation for large files */ + size = max(size, isize); +- if (size > sbi->s_mb_stream_request) { +- ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ /* Don't use group allocation for large files */ ++ if (size > sbi->s_mb_stream_request) ++ group_pa_eligible = false; ++ ++ if (!group_pa_eligible) { ++ if (inode_pa_eligible) ++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC; ++ else ++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + return; + } + diff --git a/queue-5.19/series b/queue-5.19/series index 1ad4f6e0af8..3d4d98115c5 100644 --- a/queue-5.19/series +++ b/queue-5.19/series @@ -200,3 +200,7 @@ devdax-fix-soft-reservation-memory-description.patch ext4-fix-bug-in-extents-parsing-when-eh_entries-0-and-eh_depth-0.patch ext4-limit-the-number-of-retries-after-discarding-preallocations-blocks.patch ext4-make-mballoc-try-target-group-first-even-with-mb_optimize_scan.patch +ext4-avoid-unnecessary-spreading-of-allocations-among-groups.patch +ext4-make-directory-inode-spreading-reflect-flexbg-size.patch +ext4-use-locality-group-preallocation-for-small-closed-files.patch +ext4-use-buckets-for-cr-1-block-scan-instead-of-rbtree.patch -- 2.47.3
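
For reviewers who want to sanity-check the bucketing scheme described in the "use buckets for cr 1 block scan" patch above, here is a small standalone sketch of the list-index rule that the patch's mb_avg_fragment_size_order() helper implements: an average fragment size len is mapped to one of MB_NUM_ORDERS(sb) per-order lists via fls(len) - 2, with one-block averages folded into the first list and the largest possible averages folded into the last. This is illustrative userspace code, not part of the queued patches; fls() below is a stand-in for the kernel helper of the same name, num_orders stands in for MB_NUM_ORDERS(sb), and 14 is only an example value.

#include <stdio.h>

/*
 * Userspace stand-in for the kernel's fls(): position of the most
 * significant set bit, counting from 1; returns 0 for x == 0.
 */
static int fls(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

/*
 * Mirrors the list-index rule of mb_avg_fragment_size_order() in the
 * patch above: list i holds groups whose average free-extent size len
 * satisfies fls(len) - 2 == i, with 1-block averages folded into list 0
 * and the largest averages folded into the last list, so only
 * num_orders lists are needed.
 */
static int avg_fragment_size_order(int num_orders, unsigned int len)
{
	int order = fls(len) - 2;

	if (order < 0)
		return 0;
	if (order == num_orders)
		order--;
	return order;
}

int main(void)
{
	/* Example list count only; the kernel derives it from the block size. */
	const int num_orders = 14;
	const unsigned int sizes[] = { 1, 2, 3, 8, 100, 4096, 32768 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("avg fragment size %5u blocks -> list %d\n",
		       sizes[i], avg_fragment_size_order(num_orders, sizes[i]));
	return 0;
}

Compiling and running this prints, for a handful of example averages, which list a block group would be placed on; that direct index computation is the O(1) lookup the commit message contrasts with the old rbtree search.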