1 // SPDX-License-Identifier: GPL-2.0+
3 * Copyright (C) 2016 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
8 #include "err_protos.h"
19 # define dbg_printf(f, a...) do {printf(f, ## a); fflush(stdout); } while (0)
21 # define dbg_printf(f, a...)
24 /* per-AG rmap object anchor */
26 struct xfs_slab
*ar_rmaps
; /* rmap observations, p4 */
27 struct xfs_slab
*ar_raw_rmaps
; /* unmerged rmaps */
28 int ar_flcount
; /* agfl entries from leftover */
29 /* agbt allocations */
30 struct xfs_rmap_irec ar_last_rmap
; /* last rmap seen */
31 struct xfs_slab
*ar_refcount_items
; /* refcount items, p4-5 */
34 static struct xfs_ag_rmap
*ag_rmaps
;
35 static bool rmapbt_suspect
;
36 static bool refcbt_suspect
;
/* Sort comparator for rmap records; defers to the libxfs ordering. */
static inline int rmap_compare(const void *a, const void *b)
{
	return libxfs_rmap_compare(a, b);
}
44 * Returns true if we must reconstruct either the reference count or reverse
51 return xfs_sb_version_hasreflink(&mp
->m_sb
) ||
52 xfs_sb_version_hasrmapbt(&mp
->m_sb
);
56 * Initialize per-AG reverse map data.
65 if (!rmap_needs_work(mp
))
68 ag_rmaps
= calloc(mp
->m_sb
.sb_agcount
, sizeof(struct xfs_ag_rmap
));
70 do_error(_("couldn't allocate per-AG reverse map roots\n"));
72 for (i
= 0; i
< mp
->m_sb
.sb_agcount
; i
++) {
73 error
= init_slab(&ag_rmaps
[i
].ar_rmaps
,
74 sizeof(struct xfs_rmap_irec
));
77 _("Insufficient memory while allocating reverse mapping slabs."));
78 error
= init_slab(&ag_rmaps
[i
].ar_raw_rmaps
,
79 sizeof(struct xfs_rmap_irec
));
82 _("Insufficient memory while allocating raw metadata reverse mapping slabs."));
83 ag_rmaps
[i
].ar_last_rmap
.rm_owner
= XFS_RMAP_OWN_UNKNOWN
;
84 error
= init_slab(&ag_rmaps
[i
].ar_refcount_items
,
85 sizeof(struct xfs_refcount_irec
));
88 _("Insufficient memory while allocating refcount item slabs."));
93 * Free the per-AG reverse-mapping data.
101 if (!rmap_needs_work(mp
))
104 for (i
= 0; i
< mp
->m_sb
.sb_agcount
; i
++) {
105 free_slab(&ag_rmaps
[i
].ar_rmaps
);
106 free_slab(&ag_rmaps
[i
].ar_raw_rmaps
);
107 free_slab(&ag_rmaps
[i
].ar_refcount_items
);
114 * Decide if two reverse-mapping records can be merged.
118 struct xfs_rmap_irec
*r1
,
119 struct xfs_rmap_irec
*r2
)
121 if (r1
->rm_owner
!= r2
->rm_owner
)
123 if (r1
->rm_startblock
+ r1
->rm_blockcount
!= r2
->rm_startblock
)
125 if ((unsigned long long)r1
->rm_blockcount
+ r2
->rm_blockcount
>
128 if (XFS_RMAP_NON_INODE_OWNER(r2
->rm_owner
))
130 /* must be an inode owner below here */
131 if (r1
->rm_flags
!= r2
->rm_flags
)
133 if (r1
->rm_flags
& XFS_RMAP_BMBT_BLOCK
)
135 return r1
->rm_offset
+ r1
->rm_blockcount
== r2
->rm_offset
;
139 * Add an observation about a block mapping in an inode's data or attribute
140 * fork for later btree reconstruction.
144 struct xfs_mount
*mp
,
147 struct xfs_bmbt_irec
*irec
)
149 struct xfs_rmap_irec rmap
;
152 struct xfs_rmap_irec
*last_rmap
;
155 if (!rmap_needs_work(mp
))
158 agno
= XFS_FSB_TO_AGNO(mp
, irec
->br_startblock
);
159 agbno
= XFS_FSB_TO_AGBNO(mp
, irec
->br_startblock
);
160 ASSERT(agno
!= NULLAGNUMBER
);
161 ASSERT(agno
< mp
->m_sb
.sb_agcount
);
162 ASSERT(agbno
+ irec
->br_blockcount
<= mp
->m_sb
.sb_agblocks
);
163 ASSERT(ino
!= NULLFSINO
);
164 ASSERT(whichfork
== XFS_DATA_FORK
|| whichfork
== XFS_ATTR_FORK
);
167 rmap
.rm_offset
= irec
->br_startoff
;
169 if (whichfork
== XFS_ATTR_FORK
)
170 rmap
.rm_flags
|= XFS_RMAP_ATTR_FORK
;
171 rmap
.rm_startblock
= agbno
;
172 rmap
.rm_blockcount
= irec
->br_blockcount
;
173 if (irec
->br_state
== XFS_EXT_UNWRITTEN
)
174 rmap
.rm_flags
|= XFS_RMAP_UNWRITTEN
;
175 last_rmap
= &ag_rmaps
[agno
].ar_last_rmap
;
176 if (last_rmap
->rm_owner
== XFS_RMAP_OWN_UNKNOWN
)
178 else if (rmaps_are_mergeable(last_rmap
, &rmap
))
179 last_rmap
->rm_blockcount
+= rmap
.rm_blockcount
;
181 error
= slab_add(ag_rmaps
[agno
].ar_rmaps
, last_rmap
);
190 /* Finish collecting inode data/attr fork rmaps. */
192 rmap_finish_collecting_fork_recs(
193 struct xfs_mount
*mp
,
196 if (!rmap_needs_work(mp
) ||
197 ag_rmaps
[agno
].ar_last_rmap
.rm_owner
== XFS_RMAP_OWN_UNKNOWN
)
199 return slab_add(ag_rmaps
[agno
].ar_rmaps
, &ag_rmaps
[agno
].ar_last_rmap
);
202 /* add a raw rmap; these will be merged later */
205 struct xfs_mount
*mp
,
213 struct xfs_rmap_irec rmap
;
216 rmap
.rm_owner
= owner
;
220 rmap
.rm_flags
|= XFS_RMAP_ATTR_FORK
;
222 rmap
.rm_flags
|= XFS_RMAP_BMBT_BLOCK
;
223 rmap
.rm_startblock
= agbno
;
224 rmap
.rm_blockcount
= len
;
225 return slab_add(ag_rmaps
[agno
].ar_raw_rmaps
, &rmap
);
229 * Add a reverse mapping for an inode fork's block mapping btree block.
233 struct xfs_mount
*mp
,
241 if (!rmap_needs_work(mp
))
244 agno
= XFS_FSB_TO_AGNO(mp
, fsbno
);
245 agbno
= XFS_FSB_TO_AGBNO(mp
, fsbno
);
246 ASSERT(agno
!= NULLAGNUMBER
);
247 ASSERT(agno
< mp
->m_sb
.sb_agcount
);
248 ASSERT(agbno
+ 1 <= mp
->m_sb
.sb_agblocks
);
250 return __rmap_add_raw_rec(mp
, agno
, agbno
, 1, ino
,
251 whichfork
== XFS_ATTR_FORK
, true);
255 * Add a reverse mapping for a per-AG fixed metadata extent.
259 struct xfs_mount
*mp
,
265 if (!rmap_needs_work(mp
))
268 ASSERT(agno
!= NULLAGNUMBER
);
269 ASSERT(agno
< mp
->m_sb
.sb_agcount
);
270 ASSERT(agbno
+ len
<= mp
->m_sb
.sb_agblocks
);
272 return __rmap_add_raw_rec(mp
, agno
, agbno
, len
, owner
, false, false);
276 * Merge adjacent raw rmaps and add them to the main rmap list.
280 struct xfs_mount
*mp
,
283 struct xfs_slab_cursor
*cur
= NULL
;
284 struct xfs_rmap_irec
*prev
, *rec
;
288 old_sz
= slab_count(ag_rmaps
[agno
].ar_rmaps
);
289 if (slab_count(ag_rmaps
[agno
].ar_raw_rmaps
) == 0)
291 qsort_slab(ag_rmaps
[agno
].ar_raw_rmaps
, rmap_compare
);
292 error
= init_slab_cursor(ag_rmaps
[agno
].ar_raw_rmaps
, rmap_compare
,
297 prev
= pop_slab_cursor(cur
);
298 rec
= pop_slab_cursor(cur
);
299 while (prev
&& rec
) {
300 if (rmaps_are_mergeable(prev
, rec
)) {
301 prev
->rm_blockcount
+= rec
->rm_blockcount
;
302 rec
= pop_slab_cursor(cur
);
305 error
= slab_add(ag_rmaps
[agno
].ar_rmaps
, prev
);
309 rec
= pop_slab_cursor(cur
);
312 error
= slab_add(ag_rmaps
[agno
].ar_rmaps
, prev
);
316 free_slab(&ag_rmaps
[agno
].ar_raw_rmaps
);
317 error
= init_slab(&ag_rmaps
[agno
].ar_raw_rmaps
,
318 sizeof(struct xfs_rmap_irec
));
321 _("Insufficient memory while allocating raw metadata reverse mapping slabs."));
324 qsort_slab(ag_rmaps
[agno
].ar_rmaps
, rmap_compare
);
326 free_slab_cursor(&cur
);
/* Index of the lowest clear bit in mask (64 if every bit is set). */
static int
find_first_zero_bit(
	uint64_t	mask)
{
	int		n;
	int		b = 0;

	for (n = 0; n < sizeof(mask) * NBBY && (mask & 1); n++, mask >>= 1)
		b++;

	return b;
}
/* Number of set bits in mask. */
static int
popcnt(
	uint64_t	mask)
{
	int		n;
	int		b = 0;

	if (mask == 0)
		return 0;

	for (n = 0; n < sizeof(mask) * NBBY; n++, mask >>= 1)
		if (mask & 1)
			b++;

	return b;
}
361 * Add an allocation group's fixed metadata to the rmap list. This includes
362 * sb/agi/agf/agfl headers, inode chunks, and the log.
365 rmap_add_fixed_ag_rec(
366 struct xfs_mount
*mp
,
371 ino_tree_node_t
*ino_rec
;
377 if (!rmap_needs_work(mp
))
380 /* sb/agi/agf/agfl headers */
381 error
= rmap_add_ag_rec(mp
, agno
, 0, XFS_BNO_BLOCK(mp
),
387 ino_rec
= findfirst_inode_rec(agno
);
388 for (; ino_rec
!= NULL
; ino_rec
= next_ino_rec(ino_rec
)) {
389 if (xfs_sb_version_hassparseinodes(&mp
->m_sb
)) {
390 startidx
= find_first_zero_bit(ino_rec
->ir_sparse
);
391 nr
= XFS_INODES_PER_CHUNK
- popcnt(ino_rec
->ir_sparse
);
394 nr
= XFS_INODES_PER_CHUNK
;
396 nr
/= mp
->m_sb
.sb_inopblock
;
399 agino
= ino_rec
->ino_startnum
+ startidx
;
400 agbno
= XFS_AGINO_TO_AGBNO(mp
, agino
);
401 if (XFS_AGINO_TO_OFFSET(mp
, agino
) == 0) {
402 error
= rmap_add_ag_rec(mp
, agno
, agbno
, nr
,
403 XFS_RMAP_OWN_INODES
);
410 fsbno
= mp
->m_sb
.sb_logstart
;
411 if (fsbno
&& XFS_FSB_TO_AGNO(mp
, fsbno
) == agno
) {
412 agbno
= XFS_FSB_TO_AGBNO(mp
, mp
->m_sb
.sb_logstart
);
413 error
= rmap_add_ag_rec(mp
, agno
, agbno
, mp
->m_sb
.sb_logblocks
,
423 * Copy the per-AG btree reverse-mapping data into the rmapbt.
425 * At rmapbt reconstruction time, the rmapbt will be populated _only_ with
426 * rmaps for file extents, inode chunks, AG headers, and bmbt blocks. While
427 * building the AG btrees we can record all the blocks allocated for each
428 * btree, but we cannot resolve the conflict between the fact that one has to
429 * finish allocating the space for the rmapbt before building the bnobt and the
430 * fact that allocating blocks for the bnobt requires adding rmapbt entries.
431 * Therefore we record in-core the rmaps for each btree and here use the
432 * libxfs rmap functions to finish building the rmap btree.
434 * During AGF/AGFL reconstruction in phase 5, rmaps for the AG btrees are
435 * recorded in memory. The rmapbt has not been set up yet, so we need to be
436 * able to "expand" the AGFL without updating the rmapbt. After we've written
437 * out the new AGF header the new rmapbt is available, so this function reads
438 * each AGFL to generate rmap entries. These entries are merged with the AG
439 * btree rmap entries, and then we use libxfs' rmap functions to add them to
440 * the rmapbt, after which it is fully regenerated.
443 rmap_store_ag_btree_rec(
444 struct xfs_mount
*mp
,
447 struct xfs_slab_cursor
*rm_cur
;
448 struct xfs_rmap_irec
*rm_rec
= NULL
;
449 struct xfs_buf
*agbp
= NULL
;
450 struct xfs_buf
*agflbp
= NULL
;
451 struct xfs_trans
*tp
;
452 __be32
*agfl_bno
, *b
;
454 struct xfs_owner_info oinfo
;
456 if (!xfs_sb_version_hasrmapbt(&mp
->m_sb
))
459 /* Release the ar_rmaps; they were put into the rmapbt during p5. */
460 free_slab(&ag_rmaps
[agno
].ar_rmaps
);
461 error
= init_slab(&ag_rmaps
[agno
].ar_rmaps
,
462 sizeof(struct xfs_rmap_irec
));
466 /* Add the AGFL blocks to the rmap list */
467 error
= -libxfs_trans_read_buf(
468 mp
, NULL
, mp
->m_ddev_targp
,
469 XFS_AG_DADDR(mp
, agno
, XFS_AGFL_DADDR(mp
)),
470 XFS_FSS_TO_BB(mp
, 1), 0, &agflbp
, &xfs_agfl_buf_ops
);
475 * Sometimes, the blocks at the beginning of the AGFL are there
476 * because we overestimated how many blocks we needed to rebuild
477 * the freespace btrees. ar_flcount records the number of
478 * blocks in this situation. Since those blocks already have an
479 * rmap, we only need to add rmap records for AGFL blocks past
480 * that point in the AGFL because those blocks are a result of a
481 * no-rmap no-shrink freelist fixup that we did earlier.
483 agfl_bno
= XFS_BUF_TO_AGFL_BNO(mp
, agflbp
);
484 b
= agfl_bno
+ ag_rmaps
[agno
].ar_flcount
;
485 while (*b
!= cpu_to_be32(NULLAGBLOCK
) &&
486 b
- agfl_bno
< libxfs_agfl_size(mp
)) {
487 error
= rmap_add_ag_rec(mp
, agno
, be32_to_cpu(*b
), 1,
493 libxfs_putbuf(agflbp
);
496 /* Merge all the raw rmaps into the main list */
497 error
= rmap_fold_raw_recs(mp
, agno
);
501 /* Create cursors to refcount structures */
502 error
= init_slab_cursor(ag_rmaps
[agno
].ar_rmaps
, rmap_compare
,
507 /* Insert rmaps into the btree one at a time */
508 rm_rec
= pop_slab_cursor(rm_cur
);
510 error
= -libxfs_trans_alloc_rollable(mp
, 16, &tp
);
514 error
= -libxfs_alloc_read_agf(mp
, tp
, agno
, 0, &agbp
);
518 ASSERT(XFS_RMAP_NON_INODE_OWNER(rm_rec
->rm_owner
));
519 libxfs_rmap_ag_owner(&oinfo
, rm_rec
->rm_owner
);
520 error
= -libxfs_rmap_alloc(tp
, agbp
, agno
, rm_rec
->rm_startblock
,
521 rm_rec
->rm_blockcount
, &oinfo
);
525 error
= -libxfs_trans_commit(tp
);
529 fix_freelist(mp
, agno
, false);
531 rm_rec
= pop_slab_cursor(rm_cur
);
534 free_slab_cursor(&rm_cur
);
538 libxfs_trans_cancel(tp
);
540 free_slab_cursor(&rm_cur
);
543 libxfs_putbuf(agflbp
);
#ifdef RMAP_DEBUG
/* Debug helper: print one rmap record with a tag. */
static void
rmap_dump(
	const char		*msg,
	xfs_agnumber_t		agno,
	struct xfs_rmap_irec	*rmap)
{
	printf("%s: %p agno=%u pblk=%llu own=%lld lblk=%llu len=%u flags=0x%x\n",
		msg, rmap,
		(unsigned int)agno,
		(unsigned long long)rmap->rm_startblock,
		(unsigned long long)rmap->rm_owner,
		(unsigned long long)rmap->rm_offset,
		(unsigned int)rmap->rm_blockcount,
		(unsigned int)rmap->rm_flags);
}
#else
# define rmap_dump(m, a, r)
#endif
568 * Rebuilding the Reference Count & Reverse Mapping Btrees
570 * The reference count (refcnt) and reverse mapping (rmap) btrees are
571 * rebuilt during phase 5, like all other AG btrees. Therefore, reverse
572 * mappings must be processed into reference counts at the end of phase
573 * 4, and the rmaps must be recorded during phase 4. There is a need to
574 * access the rmaps in physical block order, but no particular need for
575 * random access, so the slab.c code provides a big logical array
576 * (consisting of smaller slabs) and some inorder iterator functions.
578 * Once we've recorded all the reverse mappings, we're ready to
579 * translate the rmaps into refcount entries. Imagine the rmap entries
580 * as rectangles representing extents of physical blocks, and that the
581 * rectangles can be laid down to allow them to overlap each other; then
582 * we know that we must emit a refcnt btree entry wherever the amount of
583 * overlap changes, i.e. the emission stimulus is level-triggered:
586 * -- ----- ---- --- ------
587 * -- ---- ----------- ---- ---------
588 * -------------------------------- -----------
589 * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
590 * 2 1 23 21 3 43 234 2123 1 01 2 3 0
592 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
594 * Note that in the actual refcnt btree we don't store the refcount < 2
595 * cases because the bnobt tells us which blocks are free; single-use
596 * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
597 * supports storing multiple entries covering a given block we could
598 * theoretically dispense with the refcntbt and simply count rmaps, but
599 * that's inefficient in the (hot) write path, so we'll take the cost of
600 * the extra tree to save time. Also there's no guarantee that rmap
603 * Given an array of rmaps sorted by physical block number, a starting
604 * physical block (sp), a bag to hold rmaps that cover sp, and the next
605 * physical block where the level changes (np), we can reconstruct the
606 * refcount btree as follows:
608 * While there are still unprocessed rmaps in the array,
609 * - Set sp to the physical block (pblk) of the next unprocessed rmap.
610 * - Add to the bag all rmaps in the array where startblock == sp.
611 * - Set np to the physical block where the bag size will change. This
612 * is the minimum of (the pblk of the next unprocessed rmap) and
613 * (startblock + len of each rmap in the bag).
614 * - Record the bag size as old_bag_size.
616 * - While the bag isn't empty,
617 * - Remove from the bag all rmaps where startblock + len == np.
618 * - Add to the bag all rmaps in the array where startblock == np.
619 * - If the bag size isn't old_bag_size, store the refcount entry
620 * (sp, np - sp, bag_size) in the refcnt btree.
621 * - If the bag is empty, break out of the inner loop.
622 * - Set old_bag_size to the bag size
624 * - Set np to the physical block where the bag size will change.
625 * This is the minimum of (the pblk of the next unprocessed rmap)
626 * and (startblock + len of each rmap in the bag).
628 * An implementation detail is that because this processing happens
629 * during phase 4, the refcount entries are stored in an array so that
630 * phase 5 can load them into the refcount btree. The rmaps can be
631 * loaded directly into the rmap btree during phase 5 as well.
635 * Mark all inodes in the reverse-mapping observation stack as requiring the
636 * reflink inode flag, if the stack depth is greater than 1.
640 struct xfs_mount
*mp
,
641 struct xfs_bag
*rmaps
)
643 xfs_agnumber_t iagno
;
644 struct xfs_rmap_irec
*rmap
;
645 struct ino_tree_node
*irec
;
650 if (bag_count(rmaps
) < 2)
653 /* Reflink flag accounting */
654 foreach_bag_ptr(rmaps
, idx
, rmap
) {
655 ASSERT(!XFS_RMAP_NON_INODE_OWNER(rmap
->rm_owner
));
656 iagno
= XFS_INO_TO_AGNO(mp
, rmap
->rm_owner
);
657 ino
= XFS_INO_TO_AGINO(mp
, rmap
->rm_owner
);
658 pthread_mutex_lock(&ag_locks
[iagno
].lock
);
659 irec
= find_inode_rec(mp
, iagno
, ino
);
660 off
= get_inode_offset(mp
, rmap
->rm_owner
, irec
);
661 /* lock here because we might go outside this ag */
662 set_inode_is_rl(irec
, off
);
663 pthread_mutex_unlock(&ag_locks
[iagno
].lock
);
668 * Emit a refcount object for refcntbt reconstruction during phase 5.
670 #define REFCOUNT_CLAMP(nr) ((nr) > MAXREFCOUNT ? MAXREFCOUNT : (nr))
673 struct xfs_mount
*mp
,
679 struct xfs_refcount_irec rlrec
;
681 struct xfs_slab
*rlslab
;
683 rlslab
= ag_rmaps
[agno
].ar_refcount_items
;
684 ASSERT(nr_rmaps
> 0);
686 dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n",
687 agno
, agbno
, len
, nr_rmaps
);
688 rlrec
.rc_startblock
= agbno
;
689 rlrec
.rc_blockcount
= len
;
690 rlrec
.rc_refcount
= REFCOUNT_CLAMP(nr_rmaps
);
691 error
= slab_add(rlslab
, &rlrec
);
694 _("Insufficient memory while recreating refcount tree."));
696 #undef REFCOUNT_CLAMP
699 * Transform a pile of physical block mapping observations into refcount data
700 * for eventual rebuilding of the btrees.
702 #define RMAP_END(r) ((r)->rm_startblock + (r)->rm_blockcount)
705 struct xfs_mount
*mp
,
708 struct xfs_bag
*stack_top
= NULL
;
709 struct xfs_slab
*rmaps
;
710 struct xfs_slab_cursor
*rmaps_cur
;
711 struct xfs_rmap_irec
*array_cur
;
712 struct xfs_rmap_irec
*rmap
;
713 xfs_agblock_t sbno
; /* first bno of this rmap set */
714 xfs_agblock_t cbno
; /* first bno of this refcount set */
715 xfs_agblock_t nbno
; /* next bno where rmap set changes */
720 if (!xfs_sb_version_hasreflink(&mp
->m_sb
))
723 rmaps
= ag_rmaps
[agno
].ar_rmaps
;
725 error
= init_slab_cursor(rmaps
, rmap_compare
, &rmaps_cur
);
729 error
= init_bag(&stack_top
);
733 /* While there are rmaps to be processed... */
735 while (n
< slab_count(rmaps
)) {
736 array_cur
= peek_slab_cursor(rmaps_cur
);
737 sbno
= cbno
= array_cur
->rm_startblock
;
738 /* Push all rmaps with pblk == sbno onto the stack */
740 array_cur
&& array_cur
->rm_startblock
== sbno
;
741 array_cur
= peek_slab_cursor(rmaps_cur
)) {
742 advance_slab_cursor(rmaps_cur
); n
++;
743 rmap_dump("push0", agno
, array_cur
);
744 error
= bag_add(stack_top
, array_cur
);
748 mark_inode_rl(mp
, stack_top
);
750 /* Set nbno to the bno of the next refcount change */
751 if (n
< slab_count(rmaps
) && array_cur
)
752 nbno
= array_cur
->rm_startblock
;
755 foreach_bag_ptr(stack_top
, idx
, rmap
) {
756 nbno
= min(nbno
, RMAP_END(rmap
));
759 /* Emit reverse mappings, if needed */
761 old_stack_nr
= bag_count(stack_top
);
763 /* While stack isn't empty... */
764 while (bag_count(stack_top
)) {
765 /* Pop all rmaps that end at nbno */
766 foreach_bag_ptr_reverse(stack_top
, idx
, rmap
) {
767 if (RMAP_END(rmap
) != nbno
)
769 rmap_dump("pop", agno
, rmap
);
770 error
= bag_remove(stack_top
, idx
);
775 /* Push array items that start at nbno */
777 array_cur
&& array_cur
->rm_startblock
== nbno
;
778 array_cur
= peek_slab_cursor(rmaps_cur
)) {
779 advance_slab_cursor(rmaps_cur
); n
++;
780 rmap_dump("push1", agno
, array_cur
);
781 error
= bag_add(stack_top
, array_cur
);
785 mark_inode_rl(mp
, stack_top
);
787 /* Emit refcount if necessary */
789 if (bag_count(stack_top
) != old_stack_nr
) {
790 if (old_stack_nr
> 1) {
791 refcount_emit(mp
, agno
, cbno
,
798 /* Stack empty, go find the next rmap */
799 if (bag_count(stack_top
) == 0)
801 old_stack_nr
= bag_count(stack_top
);
804 /* Set nbno to the bno of the next refcount change */
805 if (n
< slab_count(rmaps
))
806 nbno
= array_cur
->rm_startblock
;
809 foreach_bag_ptr(stack_top
, idx
, rmap
) {
810 nbno
= min(nbno
, RMAP_END(rmap
));
813 /* Emit reverse mappings, if needed */
818 free_bag(&stack_top
);
819 free_slab_cursor(&rmaps_cur
);
826 * Return the number of rmap objects for an AG.
830 struct xfs_mount
*mp
,
833 return slab_count(ag_rmaps
[agno
].ar_rmaps
);
837 * Return a slab cursor that will return rmap objects in order.
842 struct xfs_slab_cursor
**cur
)
844 return init_slab_cursor(ag_rmaps
[agno
].ar_rmaps
, rmap_compare
, cur
);
848 * Disable the refcount btree check.
851 rmap_avoid_check(void)
853 rmapbt_suspect
= true;
856 /* Look for an rmap in the rmapbt that matches a given rmap. */
859 struct xfs_btree_cur
*bt_cur
,
860 struct xfs_rmap_irec
*rm_rec
,
861 struct xfs_rmap_irec
*tmp
,
866 /* Use the regular btree retrieval routine. */
867 error
= -libxfs_rmap_lookup_le(bt_cur
, rm_rec
->rm_startblock
,
868 rm_rec
->rm_blockcount
,
869 rm_rec
->rm_owner
, rm_rec
->rm_offset
,
870 rm_rec
->rm_flags
, have
);
875 return -libxfs_rmap_get_rec(bt_cur
, tmp
, have
);
878 /* Look for an rmap in the rmapbt that matches a given rmap. */
880 rmap_lookup_overlapped(
881 struct xfs_btree_cur
*bt_cur
,
882 struct xfs_rmap_irec
*rm_rec
,
883 struct xfs_rmap_irec
*tmp
,
886 /* Have to use our fancy version for overlapped */
887 return -libxfs_rmap_lookup_le_range(bt_cur
, rm_rec
->rm_startblock
,
888 rm_rec
->rm_owner
, rm_rec
->rm_offset
,
889 rm_rec
->rm_flags
, tmp
, have
);
892 /* Does the btree rmap cover the observed rmap? */
893 #define NEXTP(x) ((x)->rm_startblock + (x)->rm_blockcount)
894 #define NEXTL(x) ((x)->rm_offset + (x)->rm_blockcount)
897 struct xfs_rmap_irec
*observed
,
898 struct xfs_rmap_irec
*btree
)
900 /* Can't have mismatches in the flags or the owner. */
901 if (btree
->rm_flags
!= observed
->rm_flags
||
902 btree
->rm_owner
!= observed
->rm_owner
)
906 * Btree record can't physically start after the observed
907 * record, nor can it end before the observed record.
909 if (btree
->rm_startblock
> observed
->rm_startblock
||
910 NEXTP(btree
) < NEXTP(observed
))
913 /* If this is metadata or bmbt, we're done. */
914 if (XFS_RMAP_NON_INODE_OWNER(observed
->rm_owner
) ||
915 (observed
->rm_flags
& XFS_RMAP_BMBT_BLOCK
))
918 * Btree record can't logically start after the observed
919 * record, nor can it end before the observed record.
921 if (btree
->rm_offset
> observed
->rm_offset
||
922 NEXTL(btree
) < NEXTL(observed
))
931 * Compare the observed reverse mappings against what's in the ag btree.
935 struct xfs_mount
*mp
,
938 struct xfs_slab_cursor
*rm_cur
;
939 struct xfs_btree_cur
*bt_cur
= NULL
;
942 struct xfs_buf
*agbp
= NULL
;
943 struct xfs_rmap_irec
*rm_rec
;
944 struct xfs_rmap_irec tmp
;
945 struct xfs_perag
*pag
; /* per allocation group data */
947 if (!xfs_sb_version_hasrmapbt(&mp
->m_sb
))
949 if (rmapbt_suspect
) {
950 if (no_modify
&& agno
== 0)
951 do_warn(_("would rebuild corrupt rmap btrees.\n"));
955 /* Create cursors to refcount structures */
956 error
= rmap_init_cursor(agno
, &rm_cur
);
960 error
= -libxfs_alloc_read_agf(mp
, NULL
, agno
, 0, &agbp
);
964 /* Leave the per-ag data "uninitialized" since we rewrite it later */
965 pag
= libxfs_perag_get(mp
, agno
);
967 libxfs_perag_put(pag
);
969 bt_cur
= libxfs_rmapbt_init_cursor(mp
, NULL
, agbp
, agno
);
975 rm_rec
= pop_slab_cursor(rm_cur
);
977 error
= rmap_lookup(bt_cur
, rm_rec
, &tmp
, &have
);
981 * Using the range query is expensive, so only do it if
982 * the regular lookup doesn't find anything or if it doesn't
983 * match the observed rmap.
985 if (xfs_sb_version_hasreflink(&bt_cur
->bc_mp
->m_sb
) &&
986 (!have
|| !rmap_is_good(rm_rec
, &tmp
))) {
987 error
= rmap_lookup_overlapped(bt_cur
, rm_rec
,
994 _("Missing reverse-mapping record for (%u/%u) %slen %u owner %"PRId64
" \
995 %s%soff %"PRIu64
"\n"),
996 agno
, rm_rec
->rm_startblock
,
997 (rm_rec
->rm_flags
& XFS_RMAP_UNWRITTEN
) ?
998 _("unwritten ") : "",
999 rm_rec
->rm_blockcount
,
1001 (rm_rec
->rm_flags
& XFS_RMAP_ATTR_FORK
) ?
1003 (rm_rec
->rm_flags
& XFS_RMAP_BMBT_BLOCK
) ?
1009 /* Compare each refcount observation against the btree's */
1010 if (!rmap_is_good(rm_rec
, &tmp
)) {
1012 _("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64
" %s%soff \
1013 %"PRIu64
"; should be (%u/%u) %slen %u owner %"PRId64
" %s%soff %"PRIu64
"\n"),
1014 agno
, tmp
.rm_startblock
,
1015 (tmp
.rm_flags
& XFS_RMAP_UNWRITTEN
) ?
1016 _("unwritten ") : "",
1019 (tmp
.rm_flags
& XFS_RMAP_ATTR_FORK
) ?
1021 (tmp
.rm_flags
& XFS_RMAP_BMBT_BLOCK
) ?
1024 agno
, rm_rec
->rm_startblock
,
1025 (rm_rec
->rm_flags
& XFS_RMAP_UNWRITTEN
) ?
1026 _("unwritten ") : "",
1027 rm_rec
->rm_blockcount
,
1029 (rm_rec
->rm_flags
& XFS_RMAP_ATTR_FORK
) ?
1031 (rm_rec
->rm_flags
& XFS_RMAP_BMBT_BLOCK
) ?
1037 rm_rec
= pop_slab_cursor(rm_cur
);
1042 libxfs_btree_del_cursor(bt_cur
, XFS_BTREE_NOERROR
);
1044 libxfs_putbuf(agbp
);
1045 free_slab_cursor(&rm_cur
);
1050 * Compare the key fields of two rmap records -- positive if key1 > key2,
1051 * negative if key1 < key2, and zero if equal.
1055 struct xfs_rmap_irec
*kp1
,
1056 struct xfs_rmap_irec
*kp2
)
1061 struct xfs_rmap_irec tmp
;
1064 tmp
.rm_flags
&= ~XFS_RMAP_REC_FLAGS
;
1065 oa
= libxfs_rmap_irec_offset_pack(&tmp
);
1067 tmp
.rm_flags
&= ~XFS_RMAP_REC_FLAGS
;
1068 ob
= libxfs_rmap_irec_offset_pack(&tmp
);
1070 d
= (int64_t)kp1
->rm_startblock
- kp2
->rm_startblock
;
1074 if (kp1
->rm_owner
> kp2
->rm_owner
)
1076 else if (kp2
->rm_owner
> kp1
->rm_owner
)
1086 /* Compute the high key of an rmap record. */
1088 rmap_high_key_from_rec(
1089 struct xfs_rmap_irec
*rec
,
1090 struct xfs_rmap_irec
*key
)
1094 adj
= rec
->rm_blockcount
- 1;
1096 key
->rm_startblock
= rec
->rm_startblock
+ adj
;
1097 key
->rm_owner
= rec
->rm_owner
;
1098 key
->rm_offset
= rec
->rm_offset
;
1099 key
->rm_flags
= rec
->rm_flags
& XFS_RMAP_KEY_FLAGS
;
1100 if (XFS_RMAP_NON_INODE_OWNER(rec
->rm_owner
) ||
1101 (rec
->rm_flags
& XFS_RMAP_BMBT_BLOCK
))
1103 key
->rm_offset
+= adj
;
1107 * Record that an inode had the reflink flag set when repair started. The
1108 * inode reflink flag will be adjusted as necessary.
1111 record_inode_reflink_flag(
1112 struct xfs_mount
*mp
,
1113 struct xfs_dinode
*dino
,
1114 xfs_agnumber_t agno
,
1118 struct ino_tree_node
*irec
;
1121 ASSERT(XFS_AGINO_TO_INO(mp
, agno
, ino
) == be64_to_cpu(dino
->di_ino
));
1122 if (!(be64_to_cpu(dino
->di_flags2
) & XFS_DIFLAG2_REFLINK
))
1124 irec
= find_inode_rec(mp
, agno
, ino
);
1125 off
= get_inode_offset(mp
, lino
, irec
);
1126 ASSERT(!inode_was_rl(irec
, off
));
1127 set_inode_was_rl(irec
, off
);
1128 dbg_printf("set was_rl lino=%llu was=0x%llx\n",
1129 (unsigned long long)lino
, (unsigned long long)irec
->ino_was_rl
);
1133 * Fix an inode's reflink flag.
1136 fix_inode_reflink_flag(
1137 struct xfs_mount
*mp
,
1138 xfs_agnumber_t agno
,
1142 struct xfs_dinode
*dino
;
1143 struct xfs_buf
*buf
;
1147 _("setting reflink flag on inode %"PRIu64
"\n"),
1148 XFS_AGINO_TO_INO(mp
, agno
, agino
));
1149 else if (!no_modify
) /* && !set */
1151 _("clearing reflink flag on inode %"PRIu64
"\n"),
1152 XFS_AGINO_TO_INO(mp
, agno
, agino
));
1156 buf
= get_agino_buf(mp
, agno
, agino
, &dino
);
1159 ASSERT(XFS_AGINO_TO_INO(mp
, agno
, agino
) == be64_to_cpu(dino
->di_ino
));
1161 dino
->di_flags2
|= cpu_to_be64(XFS_DIFLAG2_REFLINK
);
1163 dino
->di_flags2
&= cpu_to_be64(~XFS_DIFLAG2_REFLINK
);
1164 libxfs_dinode_calc_crc(mp
, dino
);
1165 libxfs_writebuf(buf
, 0);
1171 * Fix discrepancies between the state of the inode reflink flag and our
1172 * observations as to whether or not the inode really needs it.
1175 fix_inode_reflink_flags(
1176 struct xfs_mount
*mp
,
1177 xfs_agnumber_t agno
)
1179 struct ino_tree_node
*irec
;
1189 * Update the reflink flag for any inode where there's a discrepancy
1190 * between the inode flag and whether or not we found any reflinked
1193 for (irec
= findfirst_inode_rec(agno
);
1195 irec
= next_ino_rec(irec
)) {
1196 ASSERT((irec
->ino_was_rl
& irec
->ir_free
) == 0);
1197 ASSERT((irec
->ino_is_rl
& irec
->ir_free
) == 0);
1198 was
= irec
->ino_was_rl
;
1199 is
= irec
->ino_is_rl
;
1203 dbg_printf("mismatch ino=%llu was=0x%lx is=0x%lx dif=0x%lx\n",
1204 (unsigned long long)XFS_AGINO_TO_INO(mp
, agno
,
1205 irec
->ino_startnum
),
1208 for (bit
= 0, mask
= 1; bit
< 64; bit
++, mask
<<= 1) {
1209 agino
= bit
+ irec
->ino_startnum
;
1212 else if (was
& mask
)
1213 error
= fix_inode_reflink_flag(mp
, agno
, agino
,
1216 error
= fix_inode_reflink_flag(mp
, agno
, agino
,
1222 _("Unable to fix reflink flag on inode %"PRIu64
".\n"),
1223 XFS_AGINO_TO_INO(mp
, agno
, agino
));
1231 * Return the number of refcount objects for an AG.
1234 refcount_record_count(
1235 struct xfs_mount
*mp
,
1236 xfs_agnumber_t agno
)
1238 return slab_count(ag_rmaps
[agno
].ar_refcount_items
);
1242 * Return a slab cursor that will return refcount objects in order.
1245 init_refcount_cursor(
1246 xfs_agnumber_t agno
,
1247 struct xfs_slab_cursor
**cur
)
1249 return init_slab_cursor(ag_rmaps
[agno
].ar_refcount_items
, NULL
, cur
);
1253 * Disable the refcount btree check.
1256 refcount_avoid_check(void)
1258 refcbt_suspect
= true;
1262 * Compare the observed reference counts against what's in the ag btree.
1266 struct xfs_mount
*mp
,
1267 xfs_agnumber_t agno
)
1269 struct xfs_slab_cursor
*rl_cur
;
1270 struct xfs_btree_cur
*bt_cur
= NULL
;
1274 struct xfs_buf
*agbp
= NULL
;
1275 struct xfs_refcount_irec
*rl_rec
;
1276 struct xfs_refcount_irec tmp
;
1277 struct xfs_perag
*pag
; /* per allocation group data */
1279 if (!xfs_sb_version_hasreflink(&mp
->m_sb
))
1281 if (refcbt_suspect
) {
1282 if (no_modify
&& agno
== 0)
1283 do_warn(_("would rebuild corrupt refcount btrees.\n"));
1287 /* Create cursors to refcount structures */
1288 error
= init_refcount_cursor(agno
, &rl_cur
);
1292 error
= -libxfs_alloc_read_agf(mp
, NULL
, agno
, 0, &agbp
);
1296 /* Leave the per-ag data "uninitialized" since we rewrite it later */
1297 pag
= libxfs_perag_get(mp
, agno
);
1299 libxfs_perag_put(pag
);
1301 bt_cur
= libxfs_refcountbt_init_cursor(mp
, NULL
, agbp
, agno
);
1307 rl_rec
= pop_slab_cursor(rl_cur
);
1309 /* Look for a refcount record in the btree */
1310 error
= -libxfs_refcount_lookup_le(bt_cur
,
1311 rl_rec
->rc_startblock
, &have
);
1316 _("Missing reference count record for (%u/%u) len %u count %u\n"),
1317 agno
, rl_rec
->rc_startblock
,
1318 rl_rec
->rc_blockcount
, rl_rec
->rc_refcount
);
1322 error
= -libxfs_refcount_get_rec(bt_cur
, &tmp
, &i
);
1327 _("Missing reference count record for (%u/%u) len %u count %u\n"),
1328 agno
, rl_rec
->rc_startblock
,
1329 rl_rec
->rc_blockcount
, rl_rec
->rc_refcount
);
1333 /* Compare each refcount observation against the btree's */
1334 if (tmp
.rc_startblock
!= rl_rec
->rc_startblock
||
1335 tmp
.rc_blockcount
< rl_rec
->rc_blockcount
||
1336 tmp
.rc_refcount
< rl_rec
->rc_refcount
)
1338 _("Incorrect reference count: saw (%u/%u) len %u nlinks %u; should be (%u/%u) len %u nlinks %u\n"),
1339 agno
, tmp
.rc_startblock
, tmp
.rc_blockcount
,
1340 tmp
.rc_refcount
, agno
, rl_rec
->rc_startblock
,
1341 rl_rec
->rc_blockcount
, rl_rec
->rc_refcount
);
1343 rl_rec
= pop_slab_cursor(rl_cur
);
1348 libxfs_btree_del_cursor(bt_cur
, error
? XFS_BTREE_ERROR
:
1351 libxfs_putbuf(agbp
);
1352 free_slab_cursor(&rl_cur
);
1357 * Regenerate the AGFL so that we don't run out of it while rebuilding the
1358 * rmap btree. If skip_rmapbt is true, don't update the rmapbt (most probably
1359 * because we're updating the rmapbt).
1363 struct xfs_mount
*mp
,
1364 xfs_agnumber_t agno
,
1367 xfs_alloc_arg_t args
;
1372 memset(&args
, 0, sizeof(args
));
1376 args
.pag
= libxfs_perag_get(mp
, agno
);
1377 error
= -libxfs_trans_alloc_rollable(mp
,
1378 libxfs_alloc_min_freelist(mp
, args
.pag
), &tp
);
1380 do_error(_("failed to fix AGFL on AG %d, error %d\n"),
1385 * Prior to rmapbt, all we had to do to fix the freelist is "expand"
1386 * the fresh AGFL header from empty to full. That hasn't changed. For
1387 * rmapbt, however, things change a bit.
1389 * When we're stuffing the rmapbt with the AG btree rmaps the tree can
1390 * expand, so we need to keep the AGFL well-stocked for the expansion.
1391 * However, this expansion can cause the bnobt/cntbt to shrink, which
1392 * can make the AGFL eligible for shrinking. Shrinking involves
1393 * freeing rmapbt entries, but since we haven't finished loading the
1394 * rmapbt with the btree rmaps it's possible for the remove operation
1395 * to fail. The AGFL block is large enough at this point to absorb any
1396 * blocks freed from the bnobt/cntbt, so we can disable shrinking.
1398 * During the initial AGFL regeneration during AGF generation in phase5
1399 * we must also disable rmapbt modifications because the AGF that
1400 * libxfs reads does not yet point to the new rmapbt. These initial
1401 * AGFL entries are added just prior to adding the AG btree block rmaps
1402 * to the rmapbt. It's ok to pass NOSHRINK here too, since the AGFL is
1403 * empty and cannot shrink.
1405 flags
= XFS_ALLOC_FLAG_NOSHRINK
;
1407 flags
|= XFS_ALLOC_FLAG_NORMAP
;
1408 error
= -libxfs_alloc_fix_freelist(&args
, flags
);
1409 libxfs_perag_put(args
.pag
);
1411 do_error(_("failed to fix AGFL on AG %d, error %d\n"),
1414 error
= -libxfs_trans_commit(tp
);
1416 do_error(_("%s: commit failed, error %d\n"), __func__
, error
);
1420 * Remember how many AGFL entries came from excess AG btree allocations and
1421 * therefore already have rmap entries.
1424 rmap_store_agflcount(
1425 struct xfs_mount
*mp
,
1426 xfs_agnumber_t agno
,
1429 if (!rmap_needs_work(mp
))
1432 ag_rmaps
[agno
].ar_flcount
= count
;