// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 */
#include "libxfs_priv.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_ag_resv.h"
/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also impacts on
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */
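/*
 * Illustrative ordering sketch (example values only, not from the on-disk
 * format): because keys sort by [startblock, owner, offset], two reflinked
 * owners of AG block 100 sort adjacently, e.g.
 *
 *	[startblock 100, owner 131, offset  0]
 *	[startblock 100, owner 205, offset 64]
 *	[startblock 101, owner 131, offset  1]
 *
 * so "who owns block N" becomes a simple range lookup on the first key field.
 */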
static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_private.a.agbp, cur->bc_private.a.agno);
}
STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	int			inc)
{
	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
	int			btnum = cur->bc_btnum;
	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);

	agf->agf_roots[btnum] = ptr->s;
	be32_add_cpu(&agf->agf_levels[btnum], inc);
	pag->pagf_levels[btnum] += inc;
	xfs_perag_put(pag);

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}
STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*start,
	union xfs_btree_ptr	*new,
	int			*stat)
{
	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	xfs_agblock_t		bno;
	int			error;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
				       &bno, 1);
	if (error)
		return error;

	trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
			bno, 1);
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
			false);

	xfs_trans_agbtree_delta(cur->bc_tp, 1);
	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);

	*stat = 1;
	return 0;
}
STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
	trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
			bno, 1);
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
			      XFS_EXTENT_BUSY_SKIP_DISCARD);
	xfs_trans_agbtree_delta(cur->bc_tp, -1);

	xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);

	return 0;
}
STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}
STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}
STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key	*key,
	union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = rec->rmap.rm_offset;
}
/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
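/*
 * Worked example with made-up values: a data fork mapping of 16 blocks at
 * rm_startblock 48 and rm_offset 100 gets the high key [startblock 63,
 * owner unchanged, offset 115], both fields shifted by blockcount - 1 = 15.
 * A non-inode owner or bmbt block record only has its startblock shifted.
 */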
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key	*key,
	union xfs_btree_rec	*rec)
{
	uint64_t		off;
	int			adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = rec->rmap.rm_offset;
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}
STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}
STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);

	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_roots[cur->bc_btnum];
}
STATIC int64_t
xfs_rmapbt_key_diff(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*key)
{
	struct xfs_rmap_irec	*rec = &cur->bc_rec.r;
	struct xfs_rmap_key	*kp = &key->rmap;
	__u64			x, y;
	int64_t			d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
	y = rec->rm_offset;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}
STATIC int64_t
xfs_rmapbt_diff_two_keys(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*k1,
	union xfs_btree_key	*k2)
{
	struct xfs_rmap_key	*kp1 = &k1->rmap;
	struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t			d;
	__u64			x, y;

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		       be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	x = be64_to_cpu(kp1->rm_owner);
	y = be64_to_cpu(kp2->rm_owner);
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}
static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_target->bt_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner
	 * as the perag is not fully initialised and hence not attached to the
	 * buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum
	 * limits in this case.
	 */
	if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
		return __this_address;

	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return __this_address;
	fa = xfs_btree_sblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && pag->pagf_init) {
		if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}
static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_sblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}
static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_sblock_calc_crc(bp);
}
const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};
STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_key	*k1,
	union xfs_btree_key	*k2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}
STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*r1,
	union xfs_btree_rec	*r2)
{
	uint32_t		x;
	uint32_t		y;
	uint64_t		a;
	uint64_t		b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}
static const struct xfs_btree_ops xfs_rmapbt_ops = {
	.rec_len		= sizeof(struct xfs_rmap_rec),
	.key_len		= 2 * sizeof(struct xfs_rmap_key),

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
};
/*
 * Allocate a new reverse mapping btree cursor.
 */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agnumber_t		agno)
{
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
	struct xfs_btree_cur	*cur;

	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
	cur->bc_tp = tp;
	cur->bc_mp = mp;
	/* Overlapping btree; 2 keys per pointer. */
	cur->bc_btnum = XFS_BTNUM_RMAP;
	cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
	cur->bc_blocklog = mp->m_sb.sb_blocklog;
	cur->bc_ops = &xfs_rmapbt_ops;
	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);

	cur->bc_private.a.agbp = agbp;
	cur->bc_private.a.agno = agno;

	return cur;
}
/*
 * Calculate number of records in an rmap btree block.
 */
int
xfs_rmapbt_maxrecs(
	int			blocklen,
	int			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;

	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}
/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount		*mp)
{
	/*
	 * On a non-reflink filesystem, the maximum number of rmap
	 * records is the number of blocks in the AG, hence the max
	 * rmapbt height is log_$maxrecs($agblocks). However, with
	 * reflink each AG block can have up to 2^32 (per the refcount
	 * record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records.
	 *
	 * That effectively means that the max rmapbt height must be
	 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
	 * blocks to feed the rmapbt long before the rmapbt reaches
	 * maximum height. The reflink code uses ag_resv_critical to
	 * disallow reflinking when less than 10% of the per-AG metadata
	 * block reservation remains free, since the fallback is a regular
	 * file copy.
	 */
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
	else
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
}
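/*
 * Hedged example of the non-reflink case: if m_rmap_mnr were {84, 45} for a
 * 4k block size, a one-million-block AG would need only four levels
 * (1048576 -> 12484 -> 278 -> 7 -> 1 blocks per level), comfortably below
 * XFS_BTREE_MAXLEVELS.
 */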
/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}
/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}
/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return 0;

	error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
	if (error)
		return error;

	agf = XFS_BUF_TO_AGF(agbp);
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));