git.ipfire.org Git - thirdparty/xfsprogs-dev.git/commitdiff
xfs: compute the maximum height of the rmap btree when reflink enabled
author: Darrick J. Wong <djwong@kernel.org>
Thu, 28 Apr 2022 19:39:03 +0000 (15:39 -0400)
committer: Eric Sandeen <sandeen@sandeen.net>
Thu, 28 Apr 2022 19:39:03 +0000 (15:39 -0400)
Source kernel commit: 9ec691205e7d4a11190519df6561a168ae6af3a4

Instead of assuming that the hardcoded XFS_BTREE_MAXLEVELS value is big
enough to handle the maximally tall rmap btree when all blocks are in
use and maximally shared, let's compute the maximum height assuming the
rmapbt consumes as many blocks as possible.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Chandan Babu R <chandan.babu@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
libxfs/xfs_btree.c
libxfs/xfs_btree.h
libxfs/xfs_rmap_btree.c
libxfs/xfs_trans_resv.c
libxfs/xfs_trans_space.h

index 5ada6cc4d380330569453025a4f7795d4ba51ead..e541b061cea71d605f9a88e2cd6202efe5a7846a 100644 (file)
@@ -4554,6 +4554,39 @@ xfs_btree_calc_size(
        return blocks;
 }
 
+/*
+ * Compute the maximum height of a btree that may consume up to @leaf_blocks
+ * blocks, where limits[1] is the number of pointers each interior node holds.
+ *
+ * A one-level tree is assumed to occupy a single block.  We then track how
+ * many blocks each successive node level consumes (limits[1], then limits[1]
+ * times more per level) until the next level's block count is no longer
+ * smaller than the space left.  At that point all the leaf blocks in the space
+ * are indexed and nothing is left to split the tree further: that is the
+ * maximum btree height.  Returns 0 if @leaf_blocks is zero, else at least 1.
+ */
+unsigned int
+xfs_btree_space_to_height(
+       const unsigned int      *limits,
+       unsigned long long      leaf_blocks)
+{
+       unsigned long long      node_blocks = limits[1]; /* blocks in the first node level */
+       unsigned long long      blocks_left = leaf_blocks - 1; /* wraps if leaf_blocks == 0; unused then */
+       unsigned int            height = 1; /* the single-block, single-level tree */
+
+       if (leaf_blocks < 1) /* no space at all: no tree */
+               return 0;
+
+       while (node_blocks < blocks_left) { /* stop once this level is not smaller than the space left */
+               blocks_left -= node_blocks; /* charge this node level against the space */
+               node_blocks *= limits[1]; /* next level is limits[1] times larger; NOTE(review): assumes limits[1] > 1 */
+               height++;
+       }
+
+       return height;
+}
+
 /*
  * Query a regular btree for all records overlapping a given interval.
  * Start with a LE lookup of the key of low_rec and return all records
index 3bd69fe425a7291bb85b0752839521691f74f587..e488bfcc1fc0e79d095d0cd0d06e7bc9a5dbb5a5 100644 (file)
@@ -491,6 +491,8 @@ unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
                unsigned long long records);
 unsigned long long xfs_btree_calc_size(const unsigned int *limits,
                unsigned long long records);
+unsigned int xfs_btree_space_to_height(const unsigned int *limits,
+               unsigned long long blocks);
 
 /*
  * Return codes for the query range iterator function are 0 to continue
index 4c281b71f5e57a5b3acfa27d3c76a1b943bfbdc9..eeaa1e283be230a345a8803c32fb78a69b39cc03 100644 (file)
@@ -538,26 +538,35 @@ void
 xfs_rmapbt_compute_maxlevels(
        struct xfs_mount                *mp)
 {
-       /*
-        * On a non-reflink filesystem, the maximum number of rmap
-        * records is the number of blocks in the AG, hence the max
-        * rmapbt height is log_$maxrecs($agblocks).  However, with
-        * reflink each AG block can have up to 2^32 (per the refcount
-        * record format) owners, which means that theoretically we
-        * could face up to 2^64 rmap records.
-        *
-        * That effectively means that the max rmapbt height must be
-        * XFS_BTREE_MAXLEVELS.  "Fortunately" we'll run out of AG
-        * blocks to feed the rmapbt long before the rmapbt reaches
-        * maximum height.  The reflink code uses ag_resv_critical to
-        * disallow reflinking when less than 10% of the per-AG metadata
-        * block reservation since the fallback is a regular file copy.
-        */
-       if (xfs_has_reflink(mp))
-               mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
-       else
+       if (!xfs_has_rmapbt(mp)) {
+               mp->m_rmap_maxlevels = 0;
+               return;
+       }
+
+       if (xfs_has_reflink(mp)) {
+               /*
+                * Compute the asymptotic maxlevels for an rmap btree on a
+                * filesystem that supports reflink.
+                *
+                * On a reflink filesystem, each AG block can have up to 2^32
+                * (per the refcount record format) owners, which means that
+                * theoretically we could face up to 2^64 rmap records.
+                * However, we're likely to run out of blocks in the AG long
+                * before that happens, which means that we must compute the
+                * max height based on what the btree will look like if it
+                * consumes almost all the blocks in the AG due to maximal
+                * sharing factor.
+                */
+               mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
+                               mp->m_sb.sb_agblocks);
+       } else {
+               /*
+                * If there's no block sharing, compute the maximum rmapbt
+                * height assuming one rmap record per AG block.
+                */
                mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
                                mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+       }
 }
 
 /* Calculate the refcount btree size for some records. */
index 68daefd417372dd678a4859f1ce5f3e03c105ee4..61a0a1ac2baebb5a14e52b2c4e38903dd0a89426 100644 (file)
@@ -813,6 +813,19 @@ xfs_trans_resv_calc(
        struct xfs_mount        *mp,
        struct xfs_trans_resv   *resp)
 {
+       unsigned int            rmap_maxlevels = mp->m_rmap_maxlevels;
+
+       /*
+        * In the early days of rmap+reflink, we always set the rmap maxlevels
+        * to 9 even if the AG was small enough that it would never grow to
+        * that height.  Transaction reservation sizes influence the minimum
+        * log size calculation, which influences the size of the log that mkfs
+        * creates.  Use the old value here to ensure that newly formatted
+        * small filesystems will mount on older kernels.
+        */
+       if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
+               mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
+
        /*
         * The following transactions are logged in physical format and
         * require a permanent reservation on space.
@@ -915,4 +928,7 @@ xfs_trans_resv_calc(
        resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp);
        resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp);
        resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp);
+
+       /* Put everything back the way it was.  This goes at the end. */
+       mp->m_rmap_maxlevels = rmap_maxlevels;
 }
index bd04cb836419ebbea52f2d7c9a23ba6e1dff8ff0..87b31c69a7732e2aaadd16ac9b8957c3b050053d 100644 (file)
 /* Adding one rmap could split every level up to the top of the tree. */
 #define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
 
+/*
+ * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled,
+ * so we must preserve this behavior to avoid changing the transaction space
+ * reservations and minimum log size calculations for existing filesystems.
+ */
+#define XFS_OLD_REFLINK_RMAP_MAXLEVELS         9
+
 /* Blocks we might need to add "b" rmaps to a tree. */
 #define XFS_NRMAPADD_SPACE_RES(mp, b)\
        (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \