1 // SPDX-License-Identifier: GPL-2.0+
3 * Copyright (C) 2016 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
8 #include "err_protos.h"
15 #include "libfrog/bitmap.h"
20 # define dbg_printf(f, a...) do {printf(f, ## a); fflush(stdout); } while (0)
22 # define dbg_printf(f, a...)
25 /* per-AG rmap object anchor */
27 struct xfs_slab
*ar_rmaps
; /* rmap observations, p4 */
28 struct xfs_slab
*ar_raw_rmaps
; /* unmerged rmaps */
29 int ar_flcount
; /* agfl entries from leftover */
30 /* agbt allocations */
31 struct xfs_rmap_irec ar_last_rmap
; /* last rmap seen */
32 struct xfs_slab
*ar_refcount_items
; /* refcount items, p4-5 */
35 static struct xfs_ag_rmap
*ag_rmaps
;
36 static bool rmapbt_suspect
;
37 static bool refcbt_suspect
;
/*
 * qsort/slab comparison shim: order two xfs_rmap_irec records using the
 * libxfs comparator.  (Restores braces dropped by the mangled extraction.)
 */
static inline int
rmap_compare(
	const void		*a,
	const void		*b)
{
	return libxfs_rmap_compare(a, b);
}
45 * Returns true if we must reconstruct either the reference count or reverse
52 return xfs_sb_version_hasreflink(&mp
->m_sb
) ||
53 xfs_sb_version_hasrmapbt(&mp
->m_sb
);
57 * Initialize per-AG reverse map data.
66 if (!rmap_needs_work(mp
))
69 ag_rmaps
= calloc(mp
->m_sb
.sb_agcount
, sizeof(struct xfs_ag_rmap
));
71 do_error(_("couldn't allocate per-AG reverse map roots\n"));
73 for (i
= 0; i
< mp
->m_sb
.sb_agcount
; i
++) {
74 error
= init_slab(&ag_rmaps
[i
].ar_rmaps
,
75 sizeof(struct xfs_rmap_irec
));
78 _("Insufficient memory while allocating reverse mapping slabs."));
79 error
= init_slab(&ag_rmaps
[i
].ar_raw_rmaps
,
80 sizeof(struct xfs_rmap_irec
));
83 _("Insufficient memory while allocating raw metadata reverse mapping slabs."));
84 ag_rmaps
[i
].ar_last_rmap
.rm_owner
= XFS_RMAP_OWN_UNKNOWN
;
85 error
= init_slab(&ag_rmaps
[i
].ar_refcount_items
,
86 sizeof(struct xfs_refcount_irec
));
89 _("Insufficient memory while allocating refcount item slabs."));
94 * Free the per-AG reverse-mapping data.
102 if (!rmap_needs_work(mp
))
105 for (i
= 0; i
< mp
->m_sb
.sb_agcount
; i
++) {
106 free_slab(&ag_rmaps
[i
].ar_rmaps
);
107 free_slab(&ag_rmaps
[i
].ar_raw_rmaps
);
108 free_slab(&ag_rmaps
[i
].ar_refcount_items
);
115 * Decide if two reverse-mapping records can be merged.
119 struct xfs_rmap_irec
*r1
,
120 struct xfs_rmap_irec
*r2
)
122 if (r1
->rm_owner
!= r2
->rm_owner
)
124 if (r1
->rm_startblock
+ r1
->rm_blockcount
!= r2
->rm_startblock
)
126 if ((unsigned long long)r1
->rm_blockcount
+ r2
->rm_blockcount
>
129 if (XFS_RMAP_NON_INODE_OWNER(r2
->rm_owner
))
131 /* must be an inode owner below here */
132 if (r1
->rm_flags
!= r2
->rm_flags
)
134 if (r1
->rm_flags
& XFS_RMAP_BMBT_BLOCK
)
136 return r1
->rm_offset
+ r1
->rm_blockcount
== r2
->rm_offset
;
140 * Add an observation about a block mapping in an inode's data or attribute
141 * fork for later btree reconstruction.
145 struct xfs_mount
*mp
,
148 struct xfs_bmbt_irec
*irec
)
150 struct xfs_rmap_irec rmap
;
153 struct xfs_rmap_irec
*last_rmap
;
156 if (!rmap_needs_work(mp
))
159 agno
= XFS_FSB_TO_AGNO(mp
, irec
->br_startblock
);
160 agbno
= XFS_FSB_TO_AGBNO(mp
, irec
->br_startblock
);
161 ASSERT(agno
!= NULLAGNUMBER
);
162 ASSERT(agno
< mp
->m_sb
.sb_agcount
);
163 ASSERT(agbno
+ irec
->br_blockcount
<= mp
->m_sb
.sb_agblocks
);
164 ASSERT(ino
!= NULLFSINO
);
165 ASSERT(whichfork
== XFS_DATA_FORK
|| whichfork
== XFS_ATTR_FORK
);
168 rmap
.rm_offset
= irec
->br_startoff
;
170 if (whichfork
== XFS_ATTR_FORK
)
171 rmap
.rm_flags
|= XFS_RMAP_ATTR_FORK
;
172 rmap
.rm_startblock
= agbno
;
173 rmap
.rm_blockcount
= irec
->br_blockcount
;
174 if (irec
->br_state
== XFS_EXT_UNWRITTEN
)
175 rmap
.rm_flags
|= XFS_RMAP_UNWRITTEN
;
176 last_rmap
= &ag_rmaps
[agno
].ar_last_rmap
;
177 if (last_rmap
->rm_owner
== XFS_RMAP_OWN_UNKNOWN
)
179 else if (rmaps_are_mergeable(last_rmap
, &rmap
))
180 last_rmap
->rm_blockcount
+= rmap
.rm_blockcount
;
182 error
= slab_add(ag_rmaps
[agno
].ar_rmaps
, last_rmap
);
191 /* Finish collecting inode data/attr fork rmaps. */
193 rmap_finish_collecting_fork_recs(
194 struct xfs_mount
*mp
,
197 if (!rmap_needs_work(mp
) ||
198 ag_rmaps
[agno
].ar_last_rmap
.rm_owner
== XFS_RMAP_OWN_UNKNOWN
)
200 return slab_add(ag_rmaps
[agno
].ar_rmaps
, &ag_rmaps
[agno
].ar_last_rmap
);
203 /* add a raw rmap; these will be merged later */
206 struct xfs_mount
*mp
,
214 struct xfs_rmap_irec rmap
;
217 rmap
.rm_owner
= owner
;
221 rmap
.rm_flags
|= XFS_RMAP_ATTR_FORK
;
223 rmap
.rm_flags
|= XFS_RMAP_BMBT_BLOCK
;
224 rmap
.rm_startblock
= agbno
;
225 rmap
.rm_blockcount
= len
;
226 return slab_add(ag_rmaps
[agno
].ar_raw_rmaps
, &rmap
);
230 * Add a reverse mapping for an inode fork's block mapping btree block.
234 struct xfs_mount
*mp
,
242 if (!rmap_needs_work(mp
))
245 agno
= XFS_FSB_TO_AGNO(mp
, fsbno
);
246 agbno
= XFS_FSB_TO_AGBNO(mp
, fsbno
);
247 ASSERT(agno
!= NULLAGNUMBER
);
248 ASSERT(agno
< mp
->m_sb
.sb_agcount
);
249 ASSERT(agbno
+ 1 <= mp
->m_sb
.sb_agblocks
);
251 return __rmap_add_raw_rec(mp
, agno
, agbno
, 1, ino
,
252 whichfork
== XFS_ATTR_FORK
, true);
256 * Add a reverse mapping for a per-AG fixed metadata extent.
260 struct xfs_mount
*mp
,
266 if (!rmap_needs_work(mp
))
269 ASSERT(agno
!= NULLAGNUMBER
);
270 ASSERT(agno
< mp
->m_sb
.sb_agcount
);
271 ASSERT(agbno
+ len
<= mp
->m_sb
.sb_agblocks
);
273 return __rmap_add_raw_rec(mp
, agno
, agbno
, len
, owner
, false, false);
277 * Merge adjacent raw rmaps and add them to the main rmap list.
281 struct xfs_mount
*mp
,
284 struct xfs_slab_cursor
*cur
= NULL
;
285 struct xfs_rmap_irec
*prev
, *rec
;
289 old_sz
= slab_count(ag_rmaps
[agno
].ar_rmaps
);
290 if (slab_count(ag_rmaps
[agno
].ar_raw_rmaps
) == 0)
292 qsort_slab(ag_rmaps
[agno
].ar_raw_rmaps
, rmap_compare
);
293 error
= init_slab_cursor(ag_rmaps
[agno
].ar_raw_rmaps
, rmap_compare
,
298 prev
= pop_slab_cursor(cur
);
299 rec
= pop_slab_cursor(cur
);
300 while (prev
&& rec
) {
301 if (rmaps_are_mergeable(prev
, rec
)) {
302 prev
->rm_blockcount
+= rec
->rm_blockcount
;
303 rec
= pop_slab_cursor(cur
);
306 error
= slab_add(ag_rmaps
[agno
].ar_rmaps
, prev
);
310 rec
= pop_slab_cursor(cur
);
313 error
= slab_add(ag_rmaps
[agno
].ar_rmaps
, prev
);
317 free_slab(&ag_rmaps
[agno
].ar_raw_rmaps
);
318 error
= init_slab(&ag_rmaps
[agno
].ar_raw_rmaps
,
319 sizeof(struct xfs_rmap_irec
));
322 _("Insufficient memory while allocating raw metadata reverse mapping slabs."));
325 qsort_slab(ag_rmaps
[agno
].ar_rmaps
, rmap_compare
);
327 free_slab_cursor(&cur
);
338 for (n
= 0; n
< sizeof(mask
) * NBBY
&& (mask
& 1); n
++, mask
>>= 1)
354 for (n
= 0; n
< sizeof(mask
) * NBBY
; n
++, mask
>>= 1)
362 * Add an allocation group's fixed metadata to the rmap list. This includes
363 * sb/agi/agf/agfl headers, inode chunks, and the log.
366 rmap_add_fixed_ag_rec(
367 struct xfs_mount
*mp
,
372 ino_tree_node_t
*ino_rec
;
378 if (!rmap_needs_work(mp
))
381 /* sb/agi/agf/agfl headers */
382 error
= rmap_add_ag_rec(mp
, agno
, 0, XFS_BNO_BLOCK(mp
),
388 ino_rec
= findfirst_inode_rec(agno
);
389 for (; ino_rec
!= NULL
; ino_rec
= next_ino_rec(ino_rec
)) {
390 if (xfs_sb_version_hassparseinodes(&mp
->m_sb
)) {
391 startidx
= find_first_zero_bit(ino_rec
->ir_sparse
);
392 nr
= XFS_INODES_PER_CHUNK
- popcnt(ino_rec
->ir_sparse
);
395 nr
= XFS_INODES_PER_CHUNK
;
397 nr
/= mp
->m_sb
.sb_inopblock
;
400 agino
= ino_rec
->ino_startnum
+ startidx
;
401 agbno
= XFS_AGINO_TO_AGBNO(mp
, agino
);
402 if (XFS_AGINO_TO_OFFSET(mp
, agino
) == 0) {
403 error
= rmap_add_ag_rec(mp
, agno
, agbno
, nr
,
404 XFS_RMAP_OWN_INODES
);
411 fsbno
= mp
->m_sb
.sb_logstart
;
412 if (fsbno
&& XFS_FSB_TO_AGNO(mp
, fsbno
) == agno
) {
413 agbno
= XFS_FSB_TO_AGBNO(mp
, mp
->m_sb
.sb_logstart
);
414 error
= rmap_add_ag_rec(mp
, agno
, agbno
, mp
->m_sb
.sb_logblocks
,
424 * Copy the per-AG btree reverse-mapping data into the rmapbt.
426 * At rmapbt reconstruction time, the rmapbt will be populated _only_ with
427 * rmaps for file extents, inode chunks, AG headers, and bmbt blocks. While
428 * building the AG btrees we can record all the blocks allocated for each
429 * btree, but we cannot resolve the conflict between the fact that one has to
430 * finish allocating the space for the rmapbt before building the bnobt and the
431 * fact that allocating blocks for the bnobt requires adding rmapbt entries.
432 * Therefore we record in-core the rmaps for each btree and here use the
433 * libxfs rmap functions to finish building the rmap btree.
435 * During AGF/AGFL reconstruction in phase 5, rmaps for the AG btrees are
436 * recorded in memory. The rmapbt has not been set up yet, so we need to be
437 * able to "expand" the AGFL without updating the rmapbt. After we've written
438 * out the new AGF header the new rmapbt is available, so this function reads
439 * each AGFL to generate rmap entries. These entries are merged with the AG
440 * btree rmap entries, and then we use libxfs' rmap functions to add them to
441 * the rmapbt, after which it is fully regenerated.
444 rmap_store_ag_btree_rec(
445 struct xfs_mount
*mp
,
448 struct xfs_slab_cursor
*rm_cur
;
449 struct xfs_rmap_irec
*rm_rec
= NULL
;
450 struct xfs_buf
*agbp
= NULL
;
451 struct xfs_buf
*agflbp
= NULL
;
452 struct xfs_trans
*tp
;
453 __be32
*agfl_bno
, *b
;
454 struct xfs_ag_rmap
*ag_rmap
= &ag_rmaps
[agno
];
455 struct bitmap
*own_ag_bitmap
= NULL
;
458 if (!xfs_sb_version_hasrmapbt(&mp
->m_sb
))
461 /* Release the ar_rmaps; they were put into the rmapbt during p5. */
462 free_slab(&ag_rmap
->ar_rmaps
);
463 error
= init_slab(&ag_rmap
->ar_rmaps
, sizeof(struct xfs_rmap_irec
));
467 /* Add the AGFL blocks to the rmap list */
468 error
= -libxfs_trans_read_buf(
469 mp
, NULL
, mp
->m_ddev_targp
,
470 XFS_AG_DADDR(mp
, agno
, XFS_AGFL_DADDR(mp
)),
471 XFS_FSS_TO_BB(mp
, 1), 0, &agflbp
, &xfs_agfl_buf_ops
);
476 * Sometimes, the blocks at the beginning of the AGFL are there
477 * because we overestimated how many blocks we needed to rebuild
478 * the freespace btrees. ar_flcount records the number of
479 * blocks in this situation. Since those blocks already have an
480 * rmap, we only need to add rmap records for AGFL blocks past
481 * that point in the AGFL because those blocks are a result of a
482 * no-rmap no-shrink freelist fixup that we did earlier.
484 * However, some blocks end up on the AGFL because the free space
485 * btrees shed blocks as a result of allocating space to fix the
486 * freelist. We already created in-core rmap records for the free
487 * space btree blocks, so we must be careful not to create those
488 * records again. Create a bitmap of already-recorded OWN_AG rmaps.
490 error
= init_slab_cursor(ag_rmap
->ar_raw_rmaps
, rmap_compare
, &rm_cur
);
493 error
= -bitmap_alloc(&own_ag_bitmap
);
496 while ((rm_rec
= pop_slab_cursor(rm_cur
)) != NULL
) {
497 if (rm_rec
->rm_owner
!= XFS_RMAP_OWN_AG
)
499 error
= -bitmap_set(own_ag_bitmap
, rm_rec
->rm_startblock
,
500 rm_rec
->rm_blockcount
);
503 * If this range is already set, then the incore rmap
504 * records for the AG free space btrees overlap and
505 * we're toast because that is not allowed.
508 error
= EFSCORRUPTED
;
512 free_slab_cursor(&rm_cur
);
514 /* Create rmaps for any AGFL blocks that aren't already rmapped. */
515 agfl_bno
= xfs_buf_to_agfl_bno(agflbp
);
516 b
= agfl_bno
+ ag_rmap
->ar_flcount
;
517 while (*b
!= cpu_to_be32(NULLAGBLOCK
) &&
518 b
- agfl_bno
< libxfs_agfl_size(mp
)) {
521 agbno
= be32_to_cpu(*b
);
522 if (!bitmap_test(own_ag_bitmap
, agbno
, 1)) {
523 error
= rmap_add_ag_rec(mp
, agno
, agbno
, 1,
530 libxfs_buf_relse(agflbp
);
532 bitmap_free(&own_ag_bitmap
);
534 /* Merge all the raw rmaps into the main list */
535 error
= rmap_fold_raw_recs(mp
, agno
);
539 /* Create cursors to refcount structures */
540 error
= init_slab_cursor(ag_rmap
->ar_rmaps
, rmap_compare
, &rm_cur
);
544 /* Insert rmaps into the btree one at a time */
545 rm_rec
= pop_slab_cursor(rm_cur
);
547 struct xfs_owner_info oinfo
= {};
548 struct xfs_perag
*pag
;
550 error
= -libxfs_trans_alloc_rollable(mp
, 16, &tp
);
554 error
= -libxfs_alloc_read_agf(mp
, tp
, agno
, 0, &agbp
);
558 ASSERT(XFS_RMAP_NON_INODE_OWNER(rm_rec
->rm_owner
));
559 oinfo
.oi_owner
= rm_rec
->rm_owner
;
560 pag
= libxfs_perag_get(mp
, agno
);
561 error
= -libxfs_rmap_alloc(tp
, agbp
, pag
, rm_rec
->rm_startblock
,
562 rm_rec
->rm_blockcount
, &oinfo
);
563 libxfs_perag_put(pag
);
567 error
= -libxfs_trans_commit(tp
);
571 fix_freelist(mp
, agno
, false);
573 rm_rec
= pop_slab_cursor(rm_cur
);
576 free_slab_cursor(&rm_cur
);
580 libxfs_trans_cancel(tp
);
582 free_slab_cursor(&rm_cur
);
585 libxfs_buf_relse(agflbp
);
587 bitmap_free(&own_ag_bitmap
);
596 struct xfs_rmap_irec
*rmap
)
598 printf("%s: %p agno=%u pblk=%llu own=%lld lblk=%llu len=%u flags=0x%x\n",
601 (unsigned long long)rmap
->rm_startblock
,
602 (unsigned long long)rmap
->rm_owner
,
603 (unsigned long long)rmap
->rm_offset
,
604 (unsigned int)rmap
->rm_blockcount
,
605 (unsigned int)rmap
->rm_flags
);
608 # define rmap_dump(m, a, r)
612 * Rebuilding the Reference Count & Reverse Mapping Btrees
614 * The reference count (refcnt) and reverse mapping (rmap) btrees are
615 * rebuilt during phase 5, like all other AG btrees. Therefore, reverse
616 * mappings must be processed into reference counts at the end of phase
617 * 4, and the rmaps must be recorded during phase 4. There is a need to
618 * access the rmaps in physical block order, but no particular need for
619 * random access, so the slab.c code provides a big logical array
620 * (consisting of smaller slabs) and some inorder iterator functions.
622 * Once we've recorded all the reverse mappings, we're ready to
623 * translate the rmaps into refcount entries. Imagine the rmap entries
624 * as rectangles representing extents of physical blocks, and that the
625 * rectangles can be laid down to allow them to overlap each other; then
626 * we know that we must emit a refcnt btree entry wherever the amount of
627 * overlap changes, i.e. the emission stimulus is level-triggered:
630 * -- ----- ---- --- ------
631 * -- ---- ----------- ---- ---------
632 * -------------------------------- -----------
633 * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
634 * 2 1 23 21 3 43 234 2123 1 01 2 3 0
636 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
638 * Note that in the actual refcnt btree we don't store the refcount < 2
639 * cases because the bnobt tells us which blocks are free; single-use
640 * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
641 * supports storing multiple entries covering a given block we could
642 * theoretically dispense with the refcntbt and simply count rmaps, but
643 * that's inefficient in the (hot) write path, so we'll take the cost of
644 * the extra tree to save time. Also there's no guarantee that rmap
647 * Given an array of rmaps sorted by physical block number, a starting
648 * physical block (sp), a bag to hold rmaps that cover sp, and the next
649 * physical block where the level changes (np), we can reconstruct the
650 * refcount btree as follows:
652 * While there are still unprocessed rmaps in the array,
653 * - Set sp to the physical block (pblk) of the next unprocessed rmap.
654 * - Add to the bag all rmaps in the array where startblock == sp.
655 * - Set np to the physical block where the bag size will change. This
656 * is the minimum of (the pblk of the next unprocessed rmap) and
657 * (startblock + len of each rmap in the bag).
658 * - Record the bag size as old_bag_size.
660 * - While the bag isn't empty,
661 * - Remove from the bag all rmaps where startblock + len == np.
662 * - Add to the bag all rmaps in the array where startblock == np.
663 * - If the bag size isn't old_bag_size, store the refcount entry
664 * (sp, np - sp, bag_size) in the refcnt btree.
665 * - If the bag is empty, break out of the inner loop.
666 * - Set old_bag_size to the bag size
668 * - Set np to the physical block where the bag size will change.
669 * This is the minimum of (the pblk of the next unprocessed rmap)
670 * and (startblock + len of each rmap in the bag).
672 * An implementation detail is that because this processing happens
673 * during phase 4, the refcount entries are stored in an array so that
674 * phase 5 can load them into the refcount btree. The rmaps can be
675 * loaded directly into the rmap btree during phase 5 as well.
679 * Mark all inodes in the reverse-mapping observation stack as requiring the
680 * reflink inode flag, if the stack depth is greater than 1.
684 struct xfs_mount
*mp
,
685 struct xfs_bag
*rmaps
)
687 xfs_agnumber_t iagno
;
688 struct xfs_rmap_irec
*rmap
;
689 struct ino_tree_node
*irec
;
694 if (bag_count(rmaps
) < 2)
697 /* Reflink flag accounting */
698 foreach_bag_ptr(rmaps
, idx
, rmap
) {
699 ASSERT(!XFS_RMAP_NON_INODE_OWNER(rmap
->rm_owner
));
700 iagno
= XFS_INO_TO_AGNO(mp
, rmap
->rm_owner
);
701 ino
= XFS_INO_TO_AGINO(mp
, rmap
->rm_owner
);
702 pthread_mutex_lock(&ag_locks
[iagno
].lock
);
703 irec
= find_inode_rec(mp
, iagno
, ino
);
704 off
= get_inode_offset(mp
, rmap
->rm_owner
, irec
);
705 /* lock here because we might go outside this ag */
706 set_inode_is_rl(irec
, off
);
707 pthread_mutex_unlock(&ag_locks
[iagno
].lock
);
712 * Emit a refcount object for refcntbt reconstruction during phase 5.
714 #define REFCOUNT_CLAMP(nr) ((nr) > MAXREFCOUNT ? MAXREFCOUNT : (nr))
717 struct xfs_mount
*mp
,
723 struct xfs_refcount_irec rlrec
;
725 struct xfs_slab
*rlslab
;
727 rlslab
= ag_rmaps
[agno
].ar_refcount_items
;
728 ASSERT(nr_rmaps
> 0);
730 dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n",
731 agno
, agbno
, len
, nr_rmaps
);
732 rlrec
.rc_startblock
= agbno
;
733 rlrec
.rc_blockcount
= len
;
734 rlrec
.rc_refcount
= REFCOUNT_CLAMP(nr_rmaps
);
735 error
= slab_add(rlslab
, &rlrec
);
738 _("Insufficient memory while recreating refcount tree."));
740 #undef REFCOUNT_CLAMP
743 * Transform a pile of physical block mapping observations into refcount data
744 * for eventual rebuilding of the btrees.
746 #define RMAP_END(r) ((r)->rm_startblock + (r)->rm_blockcount)
749 struct xfs_mount
*mp
,
752 struct xfs_bag
*stack_top
= NULL
;
753 struct xfs_slab
*rmaps
;
754 struct xfs_slab_cursor
*rmaps_cur
;
755 struct xfs_rmap_irec
*array_cur
;
756 struct xfs_rmap_irec
*rmap
;
757 xfs_agblock_t sbno
; /* first bno of this rmap set */
758 xfs_agblock_t cbno
; /* first bno of this refcount set */
759 xfs_agblock_t nbno
; /* next bno where rmap set changes */
764 if (!xfs_sb_version_hasreflink(&mp
->m_sb
))
767 rmaps
= ag_rmaps
[agno
].ar_rmaps
;
769 error
= init_slab_cursor(rmaps
, rmap_compare
, &rmaps_cur
);
773 error
= init_bag(&stack_top
);
777 /* While there are rmaps to be processed... */
779 while (n
< slab_count(rmaps
)) {
780 array_cur
= peek_slab_cursor(rmaps_cur
);
781 sbno
= cbno
= array_cur
->rm_startblock
;
782 /* Push all rmaps with pblk == sbno onto the stack */
784 array_cur
&& array_cur
->rm_startblock
== sbno
;
785 array_cur
= peek_slab_cursor(rmaps_cur
)) {
786 advance_slab_cursor(rmaps_cur
); n
++;
787 rmap_dump("push0", agno
, array_cur
);
788 error
= bag_add(stack_top
, array_cur
);
792 mark_inode_rl(mp
, stack_top
);
794 /* Set nbno to the bno of the next refcount change */
795 if (n
< slab_count(rmaps
) && array_cur
)
796 nbno
= array_cur
->rm_startblock
;
799 foreach_bag_ptr(stack_top
, idx
, rmap
) {
800 nbno
= min(nbno
, RMAP_END(rmap
));
803 /* Emit reverse mappings, if needed */
805 old_stack_nr
= bag_count(stack_top
);
807 /* While stack isn't empty... */
808 while (bag_count(stack_top
)) {
809 /* Pop all rmaps that end at nbno */
810 foreach_bag_ptr_reverse(stack_top
, idx
, rmap
) {
811 if (RMAP_END(rmap
) != nbno
)
813 rmap_dump("pop", agno
, rmap
);
814 error
= bag_remove(stack_top
, idx
);
819 /* Push array items that start at nbno */
821 array_cur
&& array_cur
->rm_startblock
== nbno
;
822 array_cur
= peek_slab_cursor(rmaps_cur
)) {
823 advance_slab_cursor(rmaps_cur
); n
++;
824 rmap_dump("push1", agno
, array_cur
);
825 error
= bag_add(stack_top
, array_cur
);
829 mark_inode_rl(mp
, stack_top
);
831 /* Emit refcount if necessary */
833 if (bag_count(stack_top
) != old_stack_nr
) {
834 if (old_stack_nr
> 1) {
835 refcount_emit(mp
, agno
, cbno
,
842 /* Stack empty, go find the next rmap */
843 if (bag_count(stack_top
) == 0)
845 old_stack_nr
= bag_count(stack_top
);
848 /* Set nbno to the bno of the next refcount change */
849 if (n
< slab_count(rmaps
))
850 nbno
= array_cur
->rm_startblock
;
853 foreach_bag_ptr(stack_top
, idx
, rmap
) {
854 nbno
= min(nbno
, RMAP_END(rmap
));
857 /* Emit reverse mappings, if needed */
862 free_bag(&stack_top
);
863 free_slab_cursor(&rmaps_cur
);
870 * Return the number of rmap objects for an AG.
874 struct xfs_mount
*mp
,
877 return slab_count(ag_rmaps
[agno
].ar_rmaps
);
881 * Return a slab cursor that will return rmap objects in order.
886 struct xfs_slab_cursor
**cur
)
888 return init_slab_cursor(ag_rmaps
[agno
].ar_rmaps
, rmap_compare
, cur
);
892 * Disable the refcount btree check.
895 rmap_avoid_check(void)
897 rmapbt_suspect
= true;
900 /* Look for an rmap in the rmapbt that matches a given rmap. */
903 struct xfs_btree_cur
*bt_cur
,
904 struct xfs_rmap_irec
*rm_rec
,
905 struct xfs_rmap_irec
*tmp
,
910 /* Use the regular btree retrieval routine. */
911 error
= -libxfs_rmap_lookup_le(bt_cur
, rm_rec
->rm_startblock
,
912 rm_rec
->rm_blockcount
,
913 rm_rec
->rm_owner
, rm_rec
->rm_offset
,
914 rm_rec
->rm_flags
, have
);
919 return -libxfs_rmap_get_rec(bt_cur
, tmp
, have
);
922 /* Look for an rmap in the rmapbt that matches a given rmap. */
924 rmap_lookup_overlapped(
925 struct xfs_btree_cur
*bt_cur
,
926 struct xfs_rmap_irec
*rm_rec
,
927 struct xfs_rmap_irec
*tmp
,
930 /* Have to use our fancy version for overlapped */
931 return -libxfs_rmap_lookup_le_range(bt_cur
, rm_rec
->rm_startblock
,
932 rm_rec
->rm_owner
, rm_rec
->rm_offset
,
933 rm_rec
->rm_flags
, tmp
, have
);
936 /* Does the btree rmap cover the observed rmap? */
937 #define NEXTP(x) ((x)->rm_startblock + (x)->rm_blockcount)
938 #define NEXTL(x) ((x)->rm_offset + (x)->rm_blockcount)
941 struct xfs_rmap_irec
*observed
,
942 struct xfs_rmap_irec
*btree
)
944 /* Can't have mismatches in the flags or the owner. */
945 if (btree
->rm_flags
!= observed
->rm_flags
||
946 btree
->rm_owner
!= observed
->rm_owner
)
950 * Btree record can't physically start after the observed
951 * record, nor can it end before the observed record.
953 if (btree
->rm_startblock
> observed
->rm_startblock
||
954 NEXTP(btree
) < NEXTP(observed
))
957 /* If this is metadata or bmbt, we're done. */
958 if (XFS_RMAP_NON_INODE_OWNER(observed
->rm_owner
) ||
959 (observed
->rm_flags
& XFS_RMAP_BMBT_BLOCK
))
962 * Btree record can't logically start after the observed
963 * record, nor can it end before the observed record.
965 if (btree
->rm_offset
> observed
->rm_offset
||
966 NEXTL(btree
) < NEXTL(observed
))
975 * Compare the observed reverse mappings against what's in the ag btree.
979 struct xfs_mount
*mp
,
982 struct xfs_rmap_irec tmp
;
983 struct xfs_slab_cursor
*rm_cur
;
984 struct xfs_btree_cur
*bt_cur
= NULL
;
985 struct xfs_buf
*agbp
= NULL
;
986 struct xfs_rmap_irec
*rm_rec
;
987 struct xfs_perag
*pag
= NULL
;
991 if (!xfs_sb_version_hasrmapbt(&mp
->m_sb
))
993 if (rmapbt_suspect
) {
994 if (no_modify
&& agno
== 0)
995 do_warn(_("would rebuild corrupt rmap btrees.\n"));
999 /* Create cursors to refcount structures */
1000 error
= rmap_init_cursor(agno
, &rm_cur
);
1004 error
= -libxfs_alloc_read_agf(mp
, NULL
, agno
, 0, &agbp
);
1008 /* Leave the per-ag data "uninitialized" since we rewrite it later */
1009 pag
= libxfs_perag_get(mp
, agno
);
1012 bt_cur
= libxfs_rmapbt_init_cursor(mp
, NULL
, agbp
, pag
);
1018 rm_rec
= pop_slab_cursor(rm_cur
);
1020 error
= rmap_lookup(bt_cur
, rm_rec
, &tmp
, &have
);
1024 * Using the range query is expensive, so only do it if
1025 * the regular lookup doesn't find anything or if it doesn't
1026 * match the observed rmap.
1028 if (xfs_sb_version_hasreflink(&bt_cur
->bc_mp
->m_sb
) &&
1029 (!have
|| !rmap_is_good(rm_rec
, &tmp
))) {
1030 error
= rmap_lookup_overlapped(bt_cur
, rm_rec
,
1037 _("Missing reverse-mapping record for (%u/%u) %slen %u owner %"PRId64
" \
1038 %s%soff %"PRIu64
"\n"),
1039 agno
, rm_rec
->rm_startblock
,
1040 (rm_rec
->rm_flags
& XFS_RMAP_UNWRITTEN
) ?
1041 _("unwritten ") : "",
1042 rm_rec
->rm_blockcount
,
1044 (rm_rec
->rm_flags
& XFS_RMAP_ATTR_FORK
) ?
1046 (rm_rec
->rm_flags
& XFS_RMAP_BMBT_BLOCK
) ?
1052 /* Compare each refcount observation against the btree's */
1053 if (!rmap_is_good(rm_rec
, &tmp
)) {
1055 _("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64
" %s%soff \
1056 %"PRIu64
"; should be (%u/%u) %slen %u owner %"PRId64
" %s%soff %"PRIu64
"\n"),
1057 agno
, tmp
.rm_startblock
,
1058 (tmp
.rm_flags
& XFS_RMAP_UNWRITTEN
) ?
1059 _("unwritten ") : "",
1062 (tmp
.rm_flags
& XFS_RMAP_ATTR_FORK
) ?
1064 (tmp
.rm_flags
& XFS_RMAP_BMBT_BLOCK
) ?
1067 agno
, rm_rec
->rm_startblock
,
1068 (rm_rec
->rm_flags
& XFS_RMAP_UNWRITTEN
) ?
1069 _("unwritten ") : "",
1070 rm_rec
->rm_blockcount
,
1072 (rm_rec
->rm_flags
& XFS_RMAP_ATTR_FORK
) ?
1074 (rm_rec
->rm_flags
& XFS_RMAP_BMBT_BLOCK
) ?
1080 rm_rec
= pop_slab_cursor(rm_cur
);
1085 libxfs_btree_del_cursor(bt_cur
, XFS_BTREE_NOERROR
);
1087 libxfs_perag_put(pag
);
1089 libxfs_buf_relse(agbp
);
1090 free_slab_cursor(&rm_cur
);
1095 * Compare the key fields of two rmap records -- positive if key1 > key2,
1096 * negative if key1 < key2, and zero if equal.
1100 struct xfs_rmap_irec
*kp1
,
1101 struct xfs_rmap_irec
*kp2
)
1106 struct xfs_rmap_irec tmp
;
1109 tmp
.rm_flags
&= ~XFS_RMAP_REC_FLAGS
;
1110 oa
= libxfs_rmap_irec_offset_pack(&tmp
);
1112 tmp
.rm_flags
&= ~XFS_RMAP_REC_FLAGS
;
1113 ob
= libxfs_rmap_irec_offset_pack(&tmp
);
1115 d
= (int64_t)kp1
->rm_startblock
- kp2
->rm_startblock
;
1119 if (kp1
->rm_owner
> kp2
->rm_owner
)
1121 else if (kp2
->rm_owner
> kp1
->rm_owner
)
1131 /* Compute the high key of an rmap record. */
1133 rmap_high_key_from_rec(
1134 struct xfs_rmap_irec
*rec
,
1135 struct xfs_rmap_irec
*key
)
1139 adj
= rec
->rm_blockcount
- 1;
1141 key
->rm_startblock
= rec
->rm_startblock
+ adj
;
1142 key
->rm_owner
= rec
->rm_owner
;
1143 key
->rm_offset
= rec
->rm_offset
;
1144 key
->rm_flags
= rec
->rm_flags
& XFS_RMAP_KEY_FLAGS
;
1145 if (XFS_RMAP_NON_INODE_OWNER(rec
->rm_owner
) ||
1146 (rec
->rm_flags
& XFS_RMAP_BMBT_BLOCK
))
1148 key
->rm_offset
+= adj
;
1152 * Record that an inode had the reflink flag set when repair started. The
1153 * inode reflink flag will be adjusted as necessary.
1156 record_inode_reflink_flag(
1157 struct xfs_mount
*mp
,
1158 struct xfs_dinode
*dino
,
1159 xfs_agnumber_t agno
,
1163 struct ino_tree_node
*irec
;
1166 ASSERT(XFS_AGINO_TO_INO(mp
, agno
, ino
) == be64_to_cpu(dino
->di_ino
));
1167 if (!(be64_to_cpu(dino
->di_flags2
) & XFS_DIFLAG2_REFLINK
))
1169 irec
= find_inode_rec(mp
, agno
, ino
);
1170 off
= get_inode_offset(mp
, lino
, irec
);
1171 ASSERT(!inode_was_rl(irec
, off
));
1172 set_inode_was_rl(irec
, off
);
1173 dbg_printf("set was_rl lino=%llu was=0x%llx\n",
1174 (unsigned long long)lino
, (unsigned long long)irec
->ino_was_rl
);
1178 * Inform the user that we're clearing the reflink flag on an inode that
1179 * doesn't actually share any blocks. This is an optimization (the kernel
1180 * skips refcount checks for non-reflink files) and not a corruption repair,
1181 * so we don't need to log every time we clear a flag unless verbose mode is
1185 warn_clearing_reflink(
1188 static bool warned
= false;
1189 static pthread_mutex_t lock
= PTHREAD_MUTEX_INITIALIZER
;
1192 do_warn(_("clearing reflink flag on inode %"PRIu64
"\n"), ino
);
1199 pthread_mutex_lock(&lock
);
1201 do_warn(_("clearing reflink flag on inodes when possible\n"));
1204 pthread_mutex_unlock(&lock
);
1208 * Fix an inode's reflink flag.
1211 fix_inode_reflink_flag(
1212 struct xfs_mount
*mp
,
1213 xfs_agnumber_t agno
,
1217 struct xfs_dinode
*dino
;
1218 struct xfs_buf
*buf
;
1222 _("setting reflink flag on inode %"PRIu64
"\n"),
1223 XFS_AGINO_TO_INO(mp
, agno
, agino
));
1224 else if (!no_modify
) /* && !set */
1225 warn_clearing_reflink(XFS_AGINO_TO_INO(mp
, agno
, agino
));
1229 buf
= get_agino_buf(mp
, agno
, agino
, &dino
);
1232 ASSERT(XFS_AGINO_TO_INO(mp
, agno
, agino
) == be64_to_cpu(dino
->di_ino
));
1234 dino
->di_flags2
|= cpu_to_be64(XFS_DIFLAG2_REFLINK
);
1236 dino
->di_flags2
&= cpu_to_be64(~XFS_DIFLAG2_REFLINK
);
1237 libxfs_dinode_calc_crc(mp
, dino
);
1238 libxfs_buf_mark_dirty(buf
);
1239 libxfs_buf_relse(buf
);
1245 * Fix discrepancies between the state of the inode reflink flag and our
1246 * observations as to whether or not the inode really needs it.
1249 fix_inode_reflink_flags(
1250 struct xfs_mount
*mp
,
1251 xfs_agnumber_t agno
)
1253 struct ino_tree_node
*irec
;
1263 * Update the reflink flag for any inode where there's a discrepancy
1264 * between the inode flag and whether or not we found any reflinked
1267 for (irec
= findfirst_inode_rec(agno
);
1269 irec
= next_ino_rec(irec
)) {
1270 ASSERT((irec
->ino_was_rl
& irec
->ir_free
) == 0);
1271 ASSERT((irec
->ino_is_rl
& irec
->ir_free
) == 0);
1272 was
= irec
->ino_was_rl
;
1273 is
= irec
->ino_is_rl
;
1277 dbg_printf("mismatch ino=%llu was=0x%lx is=0x%lx dif=0x%lx\n",
1278 (unsigned long long)XFS_AGINO_TO_INO(mp
, agno
,
1279 irec
->ino_startnum
),
1282 for (bit
= 0, mask
= 1; bit
< 64; bit
++, mask
<<= 1) {
1283 agino
= bit
+ irec
->ino_startnum
;
1286 else if (was
& mask
)
1287 error
= fix_inode_reflink_flag(mp
, agno
, agino
,
1290 error
= fix_inode_reflink_flag(mp
, agno
, agino
,
1296 _("Unable to fix reflink flag on inode %"PRIu64
".\n"),
1297 XFS_AGINO_TO_INO(mp
, agno
, agino
));
1305 * Return the number of refcount objects for an AG.
1308 refcount_record_count(
1309 struct xfs_mount
*mp
,
1310 xfs_agnumber_t agno
)
1312 return slab_count(ag_rmaps
[agno
].ar_refcount_items
);
1316 * Return a slab cursor that will return refcount objects in order.
1319 init_refcount_cursor(
1320 xfs_agnumber_t agno
,
1321 struct xfs_slab_cursor
**cur
)
1323 return init_slab_cursor(ag_rmaps
[agno
].ar_refcount_items
, NULL
, cur
);
1327 * Disable the refcount btree check.
1330 refcount_avoid_check(void)
1332 refcbt_suspect
= true;
1336 * Compare the observed reference counts against what's in the ag btree.
1340 struct xfs_mount
*mp
,
1341 xfs_agnumber_t agno
)
1343 struct xfs_refcount_irec tmp
;
1344 struct xfs_slab_cursor
*rl_cur
;
1345 struct xfs_btree_cur
*bt_cur
= NULL
;
1346 struct xfs_buf
*agbp
= NULL
;
1347 struct xfs_perag
*pag
= NULL
;
1348 struct xfs_refcount_irec
*rl_rec
;
1353 if (!xfs_sb_version_hasreflink(&mp
->m_sb
))
1355 if (refcbt_suspect
) {
1356 if (no_modify
&& agno
== 0)
1357 do_warn(_("would rebuild corrupt refcount btrees.\n"));
1361 /* Create cursors to refcount structures */
1362 error
= init_refcount_cursor(agno
, &rl_cur
);
1366 error
= -libxfs_alloc_read_agf(mp
, NULL
, agno
, 0, &agbp
);
1370 /* Leave the per-ag data "uninitialized" since we rewrite it later */
1371 pag
= libxfs_perag_get(mp
, agno
);
1374 bt_cur
= libxfs_refcountbt_init_cursor(mp
, NULL
, agbp
, pag
);
1380 rl_rec
= pop_slab_cursor(rl_cur
);
1382 /* Look for a refcount record in the btree */
1383 error
= -libxfs_refcount_lookup_le(bt_cur
,
1384 rl_rec
->rc_startblock
, &have
);
1389 _("Missing reference count record for (%u/%u) len %u count %u\n"),
1390 agno
, rl_rec
->rc_startblock
,
1391 rl_rec
->rc_blockcount
, rl_rec
->rc_refcount
);
1395 error
= -libxfs_refcount_get_rec(bt_cur
, &tmp
, &i
);
1400 _("Missing reference count record for (%u/%u) len %u count %u\n"),
1401 agno
, rl_rec
->rc_startblock
,
1402 rl_rec
->rc_blockcount
, rl_rec
->rc_refcount
);
1406 /* Compare each refcount observation against the btree's */
1407 if (tmp
.rc_startblock
!= rl_rec
->rc_startblock
||
1408 tmp
.rc_blockcount
!= rl_rec
->rc_blockcount
||
1409 tmp
.rc_refcount
!= rl_rec
->rc_refcount
)
1411 _("Incorrect reference count: saw (%u/%u) len %u nlinks %u; should be (%u/%u) len %u nlinks %u\n"),
1412 agno
, tmp
.rc_startblock
, tmp
.rc_blockcount
,
1413 tmp
.rc_refcount
, agno
, rl_rec
->rc_startblock
,
1414 rl_rec
->rc_blockcount
, rl_rec
->rc_refcount
);
1416 rl_rec
= pop_slab_cursor(rl_cur
);
1421 libxfs_btree_del_cursor(bt_cur
, error
? XFS_BTREE_ERROR
:
1424 libxfs_perag_put(pag
);
1426 libxfs_buf_relse(agbp
);
1427 free_slab_cursor(&rl_cur
);
1432 * Regenerate the AGFL so that we don't run out of it while rebuilding the
1433 * rmap btree. If skip_rmapbt is true, don't update the rmapbt (most probably
1434 * because we're updating the rmapbt).
1438 struct xfs_mount
*mp
,
1439 xfs_agnumber_t agno
,
1442 xfs_alloc_arg_t args
;
1447 memset(&args
, 0, sizeof(args
));
1451 args
.pag
= libxfs_perag_get(mp
, agno
);
1452 error
= -libxfs_trans_alloc_rollable(mp
, 0, &tp
);
1454 do_error(_("failed to fix AGFL on AG %d, error %d\n"),
1459 * Prior to rmapbt, all we had to do to fix the freelist is "expand"
1460 * the fresh AGFL header from empty to full. That hasn't changed. For
1461 * rmapbt, however, things change a bit.
1463 * When we're stuffing the rmapbt with the AG btree rmaps the tree can
1464 * expand, so we need to keep the AGFL well-stocked for the expansion.
1465 * However, this expansion can cause the bnobt/cntbt to shrink, which
1466 * can make the AGFL eligible for shrinking. Shrinking involves
1467 * freeing rmapbt entries, but since we haven't finished loading the
1468 * rmapbt with the btree rmaps it's possible for the remove operation
1469 * to fail. The AGFL block is large enough at this point to absorb any
1470 * blocks freed from the bnobt/cntbt, so we can disable shrinking.
1472 * During the initial AGFL regeneration during AGF generation in phase5
1473 * we must also disable rmapbt modifications because the AGF that
1474 * libxfs reads does not yet point to the new rmapbt. These initial
1475 * AGFL entries are added just prior to adding the AG btree block rmaps
1476 * to the rmapbt. It's ok to pass NOSHRINK here too, since the AGFL is
1477 * empty and cannot shrink.
1479 flags
= XFS_ALLOC_FLAG_NOSHRINK
;
1481 flags
|= XFS_ALLOC_FLAG_NORMAP
;
1482 error
= -libxfs_alloc_fix_freelist(&args
, flags
);
1483 libxfs_perag_put(args
.pag
);
1485 do_error(_("failed to fix AGFL on AG %d, error %d\n"),
1488 error
= -libxfs_trans_commit(tp
);
1490 do_error(_("%s: commit failed, error %d\n"), __func__
, error
);
1494 * Remember how many AGFL entries came from excess AG btree allocations and
1495 * therefore already have rmap entries.
1498 rmap_store_agflcount(
1499 struct xfs_mount
*mp
,
1500 xfs_agnumber_t agno
,
1503 if (!rmap_needs_work(mp
))
1506 ag_rmaps
[agno
].ar_flcount
= count
;