/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
 */
#include <libxfs.h>
#include "err_protos.h"
#include "globals.h"
#include "incore.h"
#include "dinode.h"
#include "slab.h"
#include "rmap.h"

#undef RMAP_DEBUG

#ifdef RMAP_DEBUG
# define dbg_printf(f, a...)  do {printf(f, ## a); fflush(stdout); } while (0)
#else
# define dbg_printf(f, a...)
#endif
/* per-AG rmap object anchor */
struct xfs_ag_rmap {
        struct xfs_slab         *ar_rmaps;              /* rmap observations, p4 */
        struct xfs_slab         *ar_raw_rmaps;          /* unmerged rmaps */
        int                     ar_flcount;             /* agfl entries from leftover */
                                                        /* agbt allocations */
        struct xfs_rmap_irec    ar_last_rmap;           /* last rmap seen */
        struct xfs_slab         *ar_refcount_items;     /* refcount items, p4-5 */
};

static struct xfs_ag_rmap       *ag_rmaps;
static bool                     rmapbt_suspect;
static bool                     refcbt_suspect;
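/*
 * ag_rmaps is indexed by AG number, one anchor per AG; it is sized from
 * sb_agcount when the per-AG data is initialized and torn down again when
 * repair no longer needs it.  The "suspect" flags record that a btree was
 * bad enough that comparing the rebuilt observations against it would be
 * pointless.
 */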
/*
 * Compare rmap observations for array sorting.
 */
static int
rmap_compare(
        const void              *a,
        const void              *b)
{
        const struct xfs_rmap_irec      *pa;
        const struct xfs_rmap_irec      *pb;
        __u64                   oa;
        __u64                   ob;

        pa = a; pb = b;
        oa = libxfs_rmap_irec_offset_pack(pa);
        ob = libxfs_rmap_irec_offset_pack(pb);

        if (pa->rm_startblock < pb->rm_startblock)
                return -1;
        else if (pa->rm_startblock > pb->rm_startblock)
                return 1;
        else if (pa->rm_owner < pb->rm_owner)
                return -1;
        else if (pa->rm_owner > pb->rm_owner)
                return 1;
        else if (oa < ob)
                return -1;
        else if (oa > ob)
                return 1;
        else
                return 0;
}
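/*
 * The sort key above is (startblock, owner, packed offset), which matches
 * the ordering of records in the on-disk rmap btree, so a sorted slab can
 * be loaded into the rmapbt without further rearrangement.
 */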
/*
 * Returns true if we must reconstruct either the reference count or reverse
 * mapping trees.
 */
static bool
rmap_needs_work(
        struct xfs_mount        *mp)
{
        return xfs_sb_version_hasreflink(&mp->m_sb) ||
               xfs_sb_version_hasrmapbt(&mp->m_sb);
}
/*
 * Initialize per-AG reverse map data.
 */
void
rmaps_init(
        struct xfs_mount        *mp)
{
        xfs_agnumber_t          i;
        int                     error;

        if (!rmap_needs_work(mp))
                return;

        ag_rmaps = calloc(mp->m_sb.sb_agcount, sizeof(struct xfs_ag_rmap));
        if (!ag_rmaps)
                do_error(_("couldn't allocate per-AG reverse map roots\n"));

        for (i = 0; i < mp->m_sb.sb_agcount; i++) {
                error = init_slab(&ag_rmaps[i].ar_rmaps,
                                sizeof(struct xfs_rmap_irec));
                if (error)
                        do_error(
_("Insufficient memory while allocating reverse mapping slabs."));
                error = init_slab(&ag_rmaps[i].ar_raw_rmaps,
                                sizeof(struct xfs_rmap_irec));
                if (error)
                        do_error(
_("Insufficient memory while allocating raw metadata reverse mapping slabs."));
                ag_rmaps[i].ar_last_rmap.rm_owner = XFS_RMAP_OWN_UNKNOWN;
                error = init_slab(&ag_rmaps[i].ar_refcount_items,
                                sizeof(struct xfs_refcount_irec));
                if (error)
                        do_error(
_("Insufficient memory while allocating refcount item slabs."));
        }
}
/*
 * Free the per-AG reverse-mapping data.
 */
void
rmaps_free(
        struct xfs_mount        *mp)
{
        xfs_agnumber_t          i;

        if (!rmap_needs_work(mp))
                return;

        for (i = 0; i < mp->m_sb.sb_agcount; i++) {
                free_slab(&ag_rmaps[i].ar_rmaps);
                free_slab(&ag_rmaps[i].ar_raw_rmaps);
                free_slab(&ag_rmaps[i].ar_refcount_items);
        }
        free(ag_rmaps);
        ag_rmaps = NULL;
}
/*
 * Decide if two reverse-mapping records can be merged.
 */
bool
rmaps_are_mergeable(
        struct xfs_rmap_irec    *r1,
        struct xfs_rmap_irec    *r2)
{
        if (r1->rm_owner != r2->rm_owner)
                return false;
        if (r1->rm_startblock + r1->rm_blockcount != r2->rm_startblock)
                return false;
        if ((unsigned long long)r1->rm_blockcount + r2->rm_blockcount >
            XFS_RMAP_LEN_MAX)
                return false;
        if (XFS_RMAP_NON_INODE_OWNER(r2->rm_owner))
                return true;
        /* must be an inode owner below here */
        if (r1->rm_flags != r2->rm_flags)
                return false;
        if (r1->rm_flags & XFS_RMAP_BMBT_BLOCK)
                return true;
        return r1->rm_offset + r1->rm_blockcount == r2->rm_offset;
}
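/*
 * For example, a data-fork record (startblock 10, blockcount 5, offset 0)
 * and a second record (startblock 15, blockcount 3, offset 5) with the same
 * owner and flags are both physically and logically contiguous, so they
 * merge into a single record of length 8.
 */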
/*
 * Add an observation about a block mapping in an inode's data or attribute
 * fork for later btree reconstruction.
 */
int
rmap_add_rec(
        struct xfs_mount        *mp,
        xfs_ino_t               ino,
        int                     whichfork,
        struct xfs_bmbt_irec    *irec)
{
        struct xfs_rmap_irec    rmap;
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;
        struct xfs_rmap_irec    *last_rmap;
        int                     error = 0;

        if (!rmap_needs_work(mp))
                return 0;

        agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
        agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
        ASSERT(agno != NULLAGNUMBER);
        ASSERT(agno < mp->m_sb.sb_agcount);
        ASSERT(agbno + irec->br_blockcount <= mp->m_sb.sb_agblocks);
        ASSERT(ino != NULLFSINO);
        ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);

        rmap.rm_owner = ino;
        rmap.rm_offset = irec->br_startoff;
        rmap.rm_flags = 0;
        if (whichfork == XFS_ATTR_FORK)
                rmap.rm_flags |= XFS_RMAP_ATTR_FORK;
        rmap.rm_startblock = agbno;
        rmap.rm_blockcount = irec->br_blockcount;
        if (irec->br_state == XFS_EXT_UNWRITTEN)
                rmap.rm_flags |= XFS_RMAP_UNWRITTEN;
        last_rmap = &ag_rmaps[agno].ar_last_rmap;
        if (last_rmap->rm_owner == XFS_RMAP_OWN_UNKNOWN)
                *last_rmap = rmap;
        else if (rmaps_are_mergeable(last_rmap, &rmap))
                last_rmap->rm_blockcount += rmap.rm_blockcount;
        else {
                error = slab_add(ag_rmaps[agno].ar_rmaps, last_rmap);
                if (error)
                        return error;
                *last_rmap = rmap;
        }

        return error;
}
/* Finish collecting inode data/attr fork rmaps. */
int
rmap_finish_collecting_fork_recs(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        if (!rmap_needs_work(mp) ||
            ag_rmaps[agno].ar_last_rmap.rm_owner == XFS_RMAP_OWN_UNKNOWN)
                return 0;
        return slab_add(ag_rmaps[agno].ar_rmaps, &ag_rmaps[agno].ar_last_rmap);
}
/* add a raw rmap; these will be merged later */
static int
__rmap_add_raw_rec(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_extlen_t            len,
        uint64_t                owner,
        bool                    is_attr,
        bool                    is_bmbt)
{
        struct xfs_rmap_irec    rmap;

        ASSERT(len != 0);
        rmap.rm_owner = owner;
        rmap.rm_offset = 0;
        rmap.rm_flags = 0;
        if (is_attr)
                rmap.rm_flags |= XFS_RMAP_ATTR_FORK;
        if (is_bmbt)
                rmap.rm_flags |= XFS_RMAP_BMBT_BLOCK;
        rmap.rm_startblock = agbno;
        rmap.rm_blockcount = len;
        return slab_add(ag_rmaps[agno].ar_raw_rmaps, &rmap);
}
/*
 * Add a reverse mapping for an inode fork's block mapping btree block.
 */
int
rmap_add_bmbt_rec(
        struct xfs_mount        *mp,
        xfs_ino_t               ino,
        int                     whichfork,
        xfs_fsblock_t           fsbno)
{
        xfs_agnumber_t          agno;
        xfs_agblock_t           agbno;

        if (!rmap_needs_work(mp))
                return 0;

        agno = XFS_FSB_TO_AGNO(mp, fsbno);
        agbno = XFS_FSB_TO_AGBNO(mp, fsbno);
        ASSERT(agno != NULLAGNUMBER);
        ASSERT(agno < mp->m_sb.sb_agcount);
        ASSERT(agbno + 1 <= mp->m_sb.sb_agblocks);

        return __rmap_add_raw_rec(mp, agno, agbno, 1, ino,
                        whichfork == XFS_ATTR_FORK, true);
}
/*
 * Add a reverse mapping for a per-AG fixed metadata extent.
 */
int
rmap_add_ag_rec(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_extlen_t            len,
        uint64_t                owner)
{
        if (!rmap_needs_work(mp))
                return 0;

        ASSERT(agno != NULLAGNUMBER);
        ASSERT(agno < mp->m_sb.sb_agcount);
        ASSERT(agbno + len <= mp->m_sb.sb_agblocks);

        return __rmap_add_raw_rec(mp, agno, agbno, len, owner, false, false);
}
/*
 * Merge adjacent raw rmaps and add them to the main rmap list.
 */
int
rmap_fold_raw_recs(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_slab_cursor  *cur = NULL;
        struct xfs_rmap_irec    *prev, *rec;
        size_t                  old_sz;
        int                     error = 0;

        old_sz = slab_count(ag_rmaps[agno].ar_rmaps);
        if (slab_count(ag_rmaps[agno].ar_raw_rmaps) == 0)
                goto no_raw;
        qsort_slab(ag_rmaps[agno].ar_raw_rmaps, rmap_compare);
        error = init_slab_cursor(ag_rmaps[agno].ar_raw_rmaps, rmap_compare,
                        &cur);
        if (error)
                goto err;

        prev = pop_slab_cursor(cur);
        rec = pop_slab_cursor(cur);
        while (prev && rec) {
                if (rmaps_are_mergeable(prev, rec)) {
                        prev->rm_blockcount += rec->rm_blockcount;
                        rec = pop_slab_cursor(cur);
                        continue;
                }
                error = slab_add(ag_rmaps[agno].ar_rmaps, prev);
                if (error)
                        goto err;
                prev = rec;
                rec = pop_slab_cursor(cur);
        }
        if (prev) {
                error = slab_add(ag_rmaps[agno].ar_rmaps, prev);
                if (error)
                        goto err;
        }
        free_slab(&ag_rmaps[agno].ar_raw_rmaps);
        error = init_slab(&ag_rmaps[agno].ar_raw_rmaps,
                        sizeof(struct xfs_rmap_irec));
        if (error)
                do_error(
_("Insufficient memory while allocating raw metadata reverse mapping slabs."));
no_raw:
        if (old_sz)
                qsort_slab(ag_rmaps[agno].ar_rmaps, rmap_compare);
err:
        free_slab_cursor(&cur);
        return error;
}
static int
find_first_zero_bit(
        __uint64_t              mask)
{
        int                     n;
        int                     b = 0;

        for (n = 0; n < sizeof(mask) * NBBY && (mask & 1); n++, mask >>= 1)
                b++;

        return b;
}

static int
popcnt(
        __uint64_t              mask)
{
        int                     n;
        int                     b = 0;

        if (mask == 0)
                return 0;

        for (n = 0; n < sizeof(mask) * NBBY; n++, mask >>= 1)
                if (mask & 1)
                        b++;

        return b;
}
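/*
 * In the sparse inode mask a set bit marks an inode slot that is not
 * physically allocated, so find_first_zero_bit() above returns the index of
 * the first inode that is actually present and popcnt() counts the missing
 * ones; rmap_add_fixed_ag_rec() uses both to size the inode-chunk rmap
 * records below.
 */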
/*
 * Add an allocation group's fixed metadata to the rmap list.  This includes
 * sb/agi/agf/agfl headers, inode chunks, and the log.
 */
int
rmap_add_fixed_ag_rec(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        xfs_fsblock_t           fsbno;
        xfs_agblock_t           agbno;
        ino_tree_node_t         *ino_rec;
        xfs_agino_t             agino;
        int                     error;
        int                     startidx;
        int                     nr;

        if (!rmap_needs_work(mp))
                return 0;

        /* sb/agi/agf/agfl headers */
        error = rmap_add_ag_rec(mp, agno, 0, XFS_BNO_BLOCK(mp),
                        XFS_RMAP_OWN_FS);
        if (error)
                goto out;

        /* inodes */
        ino_rec = findfirst_inode_rec(agno);
        for (; ino_rec != NULL; ino_rec = next_ino_rec(ino_rec)) {
                if (xfs_sb_version_hassparseinodes(&mp->m_sb)) {
                        startidx = find_first_zero_bit(ino_rec->ir_sparse);
                        nr = XFS_INODES_PER_CHUNK - popcnt(ino_rec->ir_sparse);
                } else {
                        startidx = 0;
                        nr = XFS_INODES_PER_CHUNK;
                }
                nr /= mp->m_sb.sb_inopblock;
                if (nr == 0)
                        nr = 1;
                agino = ino_rec->ino_startnum + startidx;
                agbno = XFS_AGINO_TO_AGBNO(mp, agino);
                if (XFS_AGINO_TO_OFFSET(mp, agino) == 0) {
                        error = rmap_add_ag_rec(mp, agno, agbno, nr,
                                        XFS_RMAP_OWN_INODES);
                        if (error)
                                goto out;
                }
        }

        /* log */
        fsbno = mp->m_sb.sb_logstart;
        if (fsbno && XFS_FSB_TO_AGNO(mp, fsbno) == agno) {
                agbno = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart);
                error = rmap_add_ag_rec(mp, agno, agbno, mp->m_sb.sb_logblocks,
                                XFS_RMAP_OWN_LOG);
                if (error)
                        goto out;
        }
out:
        return error;
}
/*
 * Copy the per-AG btree reverse-mapping data into the rmapbt.
 *
 * At rmapbt reconstruction time, the rmapbt will be populated _only_ with
 * rmaps for file extents, inode chunks, AG headers, and bmbt blocks.  While
 * building the AG btrees we can record all the blocks allocated for each
 * btree, but we cannot resolve the conflict between the fact that one has to
 * finish allocating the space for the rmapbt before building the bnobt and
 * the fact that allocating blocks for the bnobt requires adding rmapbt
 * entries.  Therefore we record in-core the rmaps for each btree and here
 * use the libxfs rmap functions to finish building the rmap btree.
 *
 * During AGF/AGFL reconstruction in phase 5, rmaps for the AG btrees are
 * recorded in memory.  The rmapbt has not been set up yet, so we need to be
 * able to "expand" the AGFL without updating the rmapbt.  After we've written
 * out the new AGF header the new rmapbt is available, so this function reads
 * each AGFL to generate rmap entries.  These entries are merged with the AG
 * btree rmap entries, and then we use libxfs' rmap functions to add them to
 * the rmapbt, after which it is fully regenerated.
 */
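/*
 * In outline, the function below reads the AGFL and adds an rmap for every
 * freelist block past ar_flcount, folds the raw per-AG rmaps into the main
 * list, and then walks that list inserting one record per transaction into
 * the rmapbt, calling fix_freelist() after each insertion so the AGFL stays
 * full enough for further rmapbt growth.
 */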
int
rmap_store_ag_btree_rec(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_slab_cursor  *rm_cur;
        struct xfs_rmap_irec    *rm_rec = NULL;
        struct xfs_buf          *agbp = NULL;
        struct xfs_buf          *agflbp = NULL;
        struct xfs_trans        *tp;
        struct xfs_trans_res    tres = {0};
        __be32                  *agfl_bno, *b;
        int                     error = 0;
        struct xfs_owner_info   oinfo;

        if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
                return 0;

        /* Release the ar_rmaps; they were put into the rmapbt during p5. */
        free_slab(&ag_rmaps[agno].ar_rmaps);
        error = init_slab(&ag_rmaps[agno].ar_rmaps,
                        sizeof(struct xfs_rmap_irec));
        if (error)
                goto err;

        /* Add the AGFL blocks to the rmap list */
        error = -libxfs_trans_read_buf(
                        mp, NULL, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
                        XFS_FSS_TO_BB(mp, 1), 0, &agflbp, &xfs_agfl_buf_ops);
        if (error)
                goto err;

        /*
         * Sometimes, the blocks at the beginning of the AGFL are there
         * because we overestimated how many blocks we needed to rebuild
         * the freespace btrees.  ar_flcount records the number of
         * blocks in this situation.  Since those blocks already have an
         * rmap, we only need to add rmap records for AGFL blocks past
         * that point in the AGFL because those blocks are a result of a
         * no-rmap no-shrink freelist fixup that we did earlier.
         */
        agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
        b = agfl_bno + ag_rmaps[agno].ar_flcount;
        while (*b != NULLAGBLOCK && b - agfl_bno < XFS_AGFL_SIZE(mp)) {
                error = rmap_add_ag_rec(mp, agno, be32_to_cpu(*b), 1,
                                XFS_RMAP_OWN_AG);
                if (error)
                        goto err;
                b++;
        }
        libxfs_putbuf(agflbp);
        agflbp = NULL;

        /* Merge all the raw rmaps into the main list */
        error = rmap_fold_raw_recs(mp, agno);
        if (error)
                goto err;

        /* Create cursors to rmap structures */
        error = init_slab_cursor(ag_rmaps[agno].ar_rmaps, rmap_compare,
                        &rm_cur);
        if (error)
                goto err;

        /* Insert rmaps into the btree one at a time */
        rm_rec = pop_slab_cursor(rm_cur);
        while (rm_rec) {
                error = -libxfs_trans_alloc(mp, &tres, 16, 0, 0, &tp);
                if (error)
                        goto err_slab;

                error = -libxfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
                if (error)
                        goto err_trans;

                ASSERT(XFS_RMAP_NON_INODE_OWNER(rm_rec->rm_owner));
                libxfs_rmap_ag_owner(&oinfo, rm_rec->rm_owner);
                error = -libxfs_rmap_alloc(tp, agbp, agno, rm_rec->rm_startblock,
                                rm_rec->rm_blockcount, &oinfo);
                if (error)
                        goto err_trans;

                error = -libxfs_trans_commit(tp);
                if (error)
                        goto err_slab;

                fix_freelist(mp, agno, false);

                rm_rec = pop_slab_cursor(rm_cur);
        }

        free_slab_cursor(&rm_cur);
        return 0;

err_trans:
        libxfs_trans_cancel(tp);
err_slab:
        free_slab_cursor(&rm_cur);
err:
        if (agflbp)
                libxfs_putbuf(agflbp);
        return error;
}
#ifdef RMAP_DEBUG
static void
rmap_dump(
        const char              *msg,
        xfs_agnumber_t          agno,
        struct xfs_rmap_irec    *rmap)
{
        printf("%s: %p agno=%u pblk=%llu own=%lld lblk=%llu len=%u flags=0x%x\n",
                msg, rmap,
                (unsigned int)agno,
                (unsigned long long)rmap->rm_startblock,
                (unsigned long long)rmap->rm_owner,
                (unsigned long long)rmap->rm_offset,
                (unsigned int)rmap->rm_blockcount,
                (unsigned int)rmap->rm_flags);
}
#else
# define rmap_dump(m, a, r)
#endif
/*
 * Rebuilding the Reference Count & Reverse Mapping Btrees
 *
 * The reference count (refcnt) and reverse mapping (rmap) btrees are
 * rebuilt during phase 5, like all other AG btrees.  Therefore, reverse
 * mappings must be processed into reference counts at the end of phase
 * 4, and the rmaps must be recorded during phase 4.  There is a need to
 * access the rmaps in physical block order, but no particular need for
 * random access, so the slab.c code provides a big logical array
 * (consisting of smaller slabs) and some inorder iterator functions.
 *
 * Once we've recorded all the reverse mappings, we're ready to
 * translate the rmaps into refcount entries.  Imagine the rmap entries
 * as rectangles representing extents of physical blocks, and that the
 * rectangles can be laid down to allow them to overlap each other; then
 * we know that we must emit a refcnt btree entry wherever the amount of
 * overlap changes, i.e. the emission stimulus is level-triggered:
 *
 *       --      ----- ----   ---        ------
 * --   ----     ----------- ----     ---------
 * -------------------------------- -----------
 * ^ ^  ^^ ^^    ^ ^^ ^^^  ^^^^  ^ ^^ ^  ^     ^
 * 2 1  23 21    3  43 234  2123  1 01 2 3     0
 *
 * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
 *
 * Note that in the actual refcnt btree we don't store the refcount < 2
 * cases because the bnobt tells us which blocks are free; single-use
 * blocks aren't recorded in the bnobt or the refcntbt.  If the rmapbt
 * supports storing multiple entries covering a given block we could
 * theoretically dispense with the refcntbt and simply count rmaps, but
 * that's inefficient in the (hot) write path, so we'll take the cost of
 * the extra tree to save time.  Also there's no guarantee that rmap
 * will be available.
 *
 * Given an array of rmaps sorted by physical block number, a starting
 * physical block (sp), a bag to hold rmaps that cover sp, and the next
 * physical block where the level changes (np), we can reconstruct the
 * refcount btree as follows:
 *
 * While there are still unprocessed rmaps in the array,
 *  - Set sp to the physical block (pblk) of the next unprocessed rmap.
 *  - Add to the bag all rmaps in the array where startblock == sp.
 *  - Set np to the physical block where the bag size will change.  This
 *    is the minimum of (the pblk of the next unprocessed rmap) and
 *    (startblock + len of each rmap in the bag).
 *  - Record the bag size as old_bag_size.
 *
 *  - While the bag isn't empty,
 *     - Remove from the bag all rmaps where startblock + len == np.
 *     - Add to the bag all rmaps in the array where startblock == np.
 *     - If the bag size isn't old_bag_size, store the refcount entry
 *       (sp, np - sp, bag_size) in the refcnt btree.
 *     - If the bag is empty, break out of the inner loop.
 *     - Set old_bag_size to the bag size.
 *     - Set np to the physical block where the bag size will change.
 *       This is the minimum of (the pblk of the next unprocessed rmap)
 *       and (startblock + len of each rmap in the bag).
 *
 * An implementation detail is that because this processing happens
 * during phase 4, the refcount entries are stored in an array so that
 * phase 5 can load them into the refcount btree.  The rmaps can be
 * loaded directly into the rmap btree during phase 5 as well.
 */
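/*
 * As a small worked example of the loop above: given two rmaps
 * (startblock 10, len 4) and (startblock 12, len 4), the sweep starts at
 * sp = 10 with one rmap in the bag, sees the level rise to 2 at block 12
 * (refcount 1 extents are not stored), emits the entry (12, 2, 2) when the
 * first rmap ends at block 14, and emits nothing for the single-owner tail
 * covering blocks 14 and 15.
 */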
/*
 * Mark all inodes in the reverse-mapping observation stack as requiring the
 * reflink inode flag, if the stack depth is greater than 1.
 */
static void
mark_inode_rl(
        struct xfs_mount        *mp,
        struct xfs_bag          *rmaps)
{
        xfs_agnumber_t          iagno;
        struct xfs_rmap_irec    *rmap;
        struct ino_tree_node    *irec;
        xfs_agino_t             ino;
        size_t                  idx;
        int                     off;

        if (bag_count(rmaps) < 2)
                return;

        /* Reflink flag accounting */
        foreach_bag_ptr(rmaps, idx, rmap) {
                ASSERT(!XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner));
                iagno = XFS_INO_TO_AGNO(mp, rmap->rm_owner);
                ino = XFS_INO_TO_AGINO(mp, rmap->rm_owner);
                pthread_mutex_lock(&ag_locks[iagno].lock);
                irec = find_inode_rec(mp, iagno, ino);
                off = get_inode_offset(mp, rmap->rm_owner, irec);
                /* lock here because we might go outside this ag */
                set_inode_is_rl(irec, off);
                pthread_mutex_unlock(&ag_locks[iagno].lock);
        }
}
/*
 * Emit a refcount object for refcntbt reconstruction during phase 5.
 */
#define REFCOUNT_CLAMP(nr)      ((nr) > MAXREFCOUNT ? MAXREFCOUNT : (nr))
static void
refcount_emit(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_extlen_t            len,
        size_t                  nr_rmaps)
{
        struct xfs_refcount_irec        rlrec;
        int                     error;
        struct xfs_slab         *rlslab;

        rlslab = ag_rmaps[agno].ar_refcount_items;
        ASSERT(nr_rmaps > 0);

        dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n",
                agno, agbno, len, nr_rmaps);
        rlrec.rc_startblock = agbno;
        rlrec.rc_blockcount = len;
        rlrec.rc_refcount = REFCOUNT_CLAMP(nr_rmaps);
        error = slab_add(rlslab, &rlrec);
        if (error)
                do_error(
_("Insufficient memory while recreating refcount tree."));
}
#undef REFCOUNT_CLAMP
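/*
 * Note that REFCOUNT_CLAMP above pins anything larger than MAXREFCOUNT to
 * the maximum value so the count still fits in the on-disk rc_refcount
 * field; extents shared more times than that simply saturate.
 */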
/*
 * Transform a pile of physical block mapping observations into refcount data
 * for eventual rebuilding of the btrees.
 */
#define RMAP_END(r)     ((r)->rm_startblock + (r)->rm_blockcount)
int
compute_refcounts(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_bag          *stack_top = NULL;
        struct xfs_slab         *rmaps;
        struct xfs_slab_cursor  *rmaps_cur;
        struct xfs_rmap_irec    *array_cur;
        struct xfs_rmap_irec    *rmap;
        xfs_agblock_t           sbno;   /* first bno of this rmap set */
        xfs_agblock_t           cbno;   /* first bno of this refcount set */
        xfs_agblock_t           nbno;   /* next bno where rmap set changes */
        size_t                  n, idx;
        size_t                  old_stack_nr;
        int                     error;

        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return 0;

        rmaps = ag_rmaps[agno].ar_rmaps;

        error = init_slab_cursor(rmaps, rmap_compare, &rmaps_cur);
        if (error)
                return error;

        error = init_bag(&stack_top);
        if (error)
                goto err;

        /* While there are rmaps to be processed... */
        n = 0;
        while (n < slab_count(rmaps)) {
                array_cur = peek_slab_cursor(rmaps_cur);
                sbno = cbno = array_cur->rm_startblock;
                /* Push all rmaps with pblk == sbno onto the stack */
                for (;
                     array_cur && array_cur->rm_startblock == sbno;
                     array_cur = peek_slab_cursor(rmaps_cur)) {
                        advance_slab_cursor(rmaps_cur); n++;
                        rmap_dump("push0", agno, array_cur);
                        error = bag_add(stack_top, array_cur);
                        if (error)
                                goto err;
                }
                mark_inode_rl(mp, stack_top);

                /* Set nbno to the bno of the next refcount change */
                if (n < slab_count(rmaps))
                        nbno = array_cur->rm_startblock;
                else
                        nbno = NULLAGBLOCK;
                foreach_bag_ptr(stack_top, idx, rmap) {
                        nbno = min(nbno, RMAP_END(rmap));
                }

                /* Emit reverse mappings, if needed */
                ASSERT(nbno > sbno);
                old_stack_nr = bag_count(stack_top);

                /* While stack isn't empty... */
                while (bag_count(stack_top)) {
                        /* Pop all rmaps that end at nbno */
                        foreach_bag_ptr_reverse(stack_top, idx, rmap) {
                                if (RMAP_END(rmap) != nbno)
                                        continue;
                                rmap_dump("pop", agno, rmap);
                                error = bag_remove(stack_top, idx);
                                if (error)
                                        goto err;
                        }

                        /* Push array items that start at nbno */
                        for (;
                             array_cur && array_cur->rm_startblock == nbno;
                             array_cur = peek_slab_cursor(rmaps_cur)) {
                                advance_slab_cursor(rmaps_cur); n++;
                                rmap_dump("push1", agno, array_cur);
                                error = bag_add(stack_top, array_cur);
                                if (error)
                                        goto err;
                        }
                        mark_inode_rl(mp, stack_top);

                        /* Emit refcount if necessary */
                        ASSERT(nbno > cbno);
                        if (bag_count(stack_top) != old_stack_nr) {
                                if (old_stack_nr > 1) {
                                        refcount_emit(mp, agno, cbno,
                                                        nbno - cbno,
                                                        old_stack_nr);
                                }
                                cbno = nbno;
                        }

                        /* Stack empty, go find the next rmap */
                        if (bag_count(stack_top) == 0)
                                break;
                        old_stack_nr = bag_count(stack_top);
                        sbno = nbno;

                        /* Set nbno to the bno of the next refcount change */
                        if (n < slab_count(rmaps))
                                nbno = array_cur->rm_startblock;
                        else
                                nbno = NULLAGBLOCK;
                        foreach_bag_ptr(stack_top, idx, rmap) {
                                nbno = min(nbno, RMAP_END(rmap));
                        }

                        /* Emit reverse mappings, if needed */
                        ASSERT(nbno > sbno);
                }
        }
err:
        free_bag(&stack_top);
        free_slab_cursor(&rmaps_cur);

        return error;
}
#undef RMAP_END
/*
 * Return the number of rmap objects for an AG.
 */
size_t
rmap_record_count(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        return slab_count(ag_rmaps[agno].ar_rmaps);
}

/*
 * Return a slab cursor that will return rmap objects in order.
 */
int
rmap_init_cursor(
        xfs_agnumber_t          agno,
        struct xfs_slab_cursor  **cur)
{
        return init_slab_cursor(ag_rmaps[agno].ar_rmaps, rmap_compare, cur);
}

/*
 * Disable the reverse-mapping btree check.
 */
void
rmap_avoid_check(void)
{
        rmapbt_suspect = true;
}
/* Look for an rmap in the rmapbt that matches a given rmap. */
static int
rmap_lookup(
        struct xfs_btree_cur    *bt_cur,
        struct xfs_rmap_irec    *rm_rec,
        struct xfs_rmap_irec    *tmp,
        int                     *have)
{
        int                     error;

        /* Use the regular btree retrieval routine. */
        error = -libxfs_rmap_lookup_le(bt_cur, rm_rec->rm_startblock,
                                rm_rec->rm_blockcount,
                                rm_rec->rm_owner, rm_rec->rm_offset,
                                rm_rec->rm_flags, have);
        if (error)
                return error;
        if (*have == 0)
                return error;
        return -libxfs_rmap_get_rec(bt_cur, tmp, have);
}
/* Does the btree rmap cover the observed rmap? */
#define NEXTP(x)        ((x)->rm_startblock + (x)->rm_blockcount)
#define NEXTL(x)        ((x)->rm_offset + (x)->rm_blockcount)
static bool
rmap_is_good(
        struct xfs_rmap_irec    *observed,
        struct xfs_rmap_irec    *btree)
{
        /* Can't have mismatches in the flags or the owner. */
        if (btree->rm_flags != observed->rm_flags ||
            btree->rm_owner != observed->rm_owner)
                return false;

        /*
         * Btree record can't physically start after the observed
         * record, nor can it end before the observed record.
         */
        if (btree->rm_startblock > observed->rm_startblock ||
            NEXTP(btree) < NEXTP(observed))
                return false;

        /* If this is metadata or bmbt, we're done. */
        if (XFS_RMAP_NON_INODE_OWNER(observed->rm_owner) ||
            (observed->rm_flags & XFS_RMAP_BMBT_BLOCK))
                return true;

        /*
         * Btree record can't logically start after the observed
         * record, nor can it end before the observed record.
         */
        if (btree->rm_offset > observed->rm_offset ||
            NEXTL(btree) < NEXTL(observed))
                return false;

        return true;
}
#undef NEXTP
#undef NEXTL
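/*
 * In other words, an observed data-fork rmap (agbno 100, len 10, offset 50)
 * passes the check above if the btree holds, say, (agbno 95, len 20,
 * offset 45) for the same owner and flags, because the btree record contains
 * the observation both physically and logically.
 */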
/*
 * Compare the observed reverse mappings against what's in the ag btree.
 */
void
rmaps_verify_btree(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_slab_cursor  *rm_cur;
        struct xfs_btree_cur    *bt_cur = NULL;
        int                     error;
        int                     have;
        struct xfs_buf          *agbp = NULL;
        struct xfs_rmap_irec    *rm_rec;
        struct xfs_rmap_irec    tmp;
        struct xfs_perag        *pag;           /* per allocation group data */

        if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
                return;
        if (rmapbt_suspect) {
                if (no_modify && agno == 0)
                        do_warn(_("would rebuild corrupt rmap btrees.\n"));
                return;
        }

        /* Create cursors to rmap structures */
        error = rmap_init_cursor(agno, &rm_cur);
        if (error)
                do_error(
_("Not enough memory to check reverse mappings.\n"));

        error = -libxfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
        if (error)
                goto err;

        /* Leave the per-ag data "uninitialized" since we rewrite it later */
        pag = libxfs_perag_get(mp, agno);
        pag->pagf_init = 0;
        libxfs_perag_put(pag);

        bt_cur = libxfs_rmapbt_init_cursor(mp, NULL, agbp, agno);
        if (!bt_cur)
                goto err;

        rm_rec = pop_slab_cursor(rm_cur);
        while (rm_rec) {
                error = rmap_lookup(bt_cur, rm_rec, &tmp, &have);
                if (error)
                        goto err;
                if (!have) {
                        do_warn(
_("Missing reverse-mapping record for (%u/%u) %slen %u owner %"PRId64" \
%s%soff %"PRIu64"\n"),
                                agno, rm_rec->rm_startblock,
                                (rm_rec->rm_flags & XFS_RMAP_UNWRITTEN) ?
                                        _("unwritten ") : "",
                                rm_rec->rm_blockcount,
                                rm_rec->rm_owner,
                                (rm_rec->rm_flags & XFS_RMAP_ATTR_FORK) ?
                                        _("attr ") : "",
                                (rm_rec->rm_flags & XFS_RMAP_BMBT_BLOCK) ?
                                        _("bmbt ") : "",
                                rm_rec->rm_offset);
                        goto next_loop;
                }

                /* Compare each refcount observation against the btree's */
                if (!rmap_is_good(rm_rec, &tmp)) {
                        do_warn(
_("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64" %s%soff \
%"PRIu64"; should be (%u/%u) %slen %u owner %"PRId64" %s%soff %"PRIu64"\n"),
                                agno, tmp.rm_startblock,
                                (tmp.rm_flags & XFS_RMAP_UNWRITTEN) ?
                                        _("unwritten ") : "",
                                tmp.rm_blockcount,
                                tmp.rm_owner,
                                (tmp.rm_flags & XFS_RMAP_ATTR_FORK) ?
                                        _("attr ") : "",
                                (tmp.rm_flags & XFS_RMAP_BMBT_BLOCK) ?
                                        _("bmbt ") : "",
                                tmp.rm_offset,
                                agno, rm_rec->rm_startblock,
                                (rm_rec->rm_flags & XFS_RMAP_UNWRITTEN) ?
                                        _("unwritten ") : "",
                                rm_rec->rm_blockcount,
                                rm_rec->rm_owner,
                                (rm_rec->rm_flags & XFS_RMAP_ATTR_FORK) ?
                                        _("attr ") : "",
                                (rm_rec->rm_flags & XFS_RMAP_BMBT_BLOCK) ?
                                        _("bmbt ") : "",
                                rm_rec->rm_offset);
                }
next_loop:
                rm_rec = pop_slab_cursor(rm_cur);
        }

err:
        if (bt_cur)
                libxfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
        if (agbp)
                libxfs_putbuf(agbp);
        free_slab_cursor(&rm_cur);
}
/*
 * Compare the key fields of two rmap records -- positive if key1 > key2,
 * negative if key1 < key2, and zero if equal.
 */
__int64_t
rmap_diffkeys(
        struct xfs_rmap_irec    *kp1,
        struct xfs_rmap_irec    *kp2)
{
        __u64                   oa;
        __u64                   ob;
        __int64_t               d;
        struct xfs_rmap_irec    tmp;

        tmp = *kp1;
        tmp.rm_flags &= ~XFS_RMAP_REC_FLAGS;
        oa = libxfs_rmap_irec_offset_pack(&tmp);
        tmp = *kp2;
        tmp.rm_flags &= ~XFS_RMAP_REC_FLAGS;
        ob = libxfs_rmap_irec_offset_pack(&tmp);

        d = (__int64_t)kp1->rm_startblock - kp2->rm_startblock;
        if (d)
                return d;

        if (kp1->rm_owner > kp2->rm_owner)
                return 1;
        else if (kp2->rm_owner > kp1->rm_owner)
                return -1;

        if (oa > ob)
                return 1;
        else if (ob > oa)
                return -1;
        return 0;
}
/* Compute the high key of an rmap record. */
void
rmap_high_key_from_rec(
        struct xfs_rmap_irec    *rec,
        struct xfs_rmap_irec    *key)
{
        int                     adj;

        adj = rec->rm_blockcount - 1;

        key->rm_startblock = rec->rm_startblock + adj;
        key->rm_owner = rec->rm_owner;
        key->rm_offset = rec->rm_offset;
        key->rm_flags = rec->rm_flags & XFS_RMAP_KEY_FLAGS;
        if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
            (rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
                return;
        key->rm_offset += adj;
}
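/*
 * For instance, a file-extent record (startblock 100, blockcount 8,
 * offset 20) yields the high key (startblock 107, offset 27), while for AG
 * metadata or bmbt records only the startblock is advanced and the offset
 * is left alone.
 */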
/*
 * Record that an inode had the reflink flag set when repair started.  The
 * inode reflink flag will be adjusted as necessary.
 */
void
record_inode_reflink_flag(
        struct xfs_mount        *mp,
        struct xfs_dinode       *dino,
        xfs_agnumber_t          agno,
        xfs_agino_t             ino,
        xfs_ino_t               lino)
{
        struct ino_tree_node    *irec;
        int                     off;

        ASSERT(XFS_AGINO_TO_INO(mp, agno, ino) == be64_to_cpu(dino->di_ino));
        if (!(be64_to_cpu(dino->di_flags2) & XFS_DIFLAG2_REFLINK))
                return;
        irec = find_inode_rec(mp, agno, ino);
        off = get_inode_offset(mp, lino, irec);
        ASSERT(!inode_was_rl(irec, off));
        set_inode_was_rl(irec, off);
        dbg_printf("set was_rl lino=%llu was=0x%llx\n",
                (unsigned long long)lino, (unsigned long long)irec->ino_was_rl);
}
/*
 * Fix an inode's reflink flag.
 */
static int
fix_inode_reflink_flag(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        xfs_agino_t             agino,
        bool                    set)
{
        struct xfs_dinode       *dino;
        struct xfs_buf          *buf;

        if (set)
                do_warn(
_("setting reflink flag on inode %"PRIu64"\n"),
                        XFS_AGINO_TO_INO(mp, agno, agino));
        else if (!no_modify) /* && !set */
                do_warn(
_("clearing reflink flag on inode %"PRIu64"\n"),
                        XFS_AGINO_TO_INO(mp, agno, agino));
        if (no_modify)
                return 0;

        buf = get_agino_buf(mp, agno, agino, &dino);
        if (!buf)
                return 1;
        ASSERT(XFS_AGINO_TO_INO(mp, agno, agino) == be64_to_cpu(dino->di_ino));
        if (set)
                dino->di_flags2 |= cpu_to_be64(XFS_DIFLAG2_REFLINK);
        else
                dino->di_flags2 &= cpu_to_be64(~XFS_DIFLAG2_REFLINK);
        libxfs_dinode_calc_crc(mp, dino);
        libxfs_writebuf(buf, 0);

        return 0;
}
/*
 * Fix discrepancies between the state of the inode reflink flag and our
 * observations as to whether or not the inode really needs it.
 */
void
fix_inode_reflink_flags(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct ino_tree_node    *irec;
        int                     bit;
        __uint64_t              was;
        __uint64_t              is;
        __uint64_t              diff;
        __uint64_t              mask;
        int                     error = 0;
        xfs_agino_t             agino;

        /*
         * Update the reflink flag for any inode where there's a discrepancy
         * between the inode flag and whether or not we found any reflinked
         * extents.
         */
        for (irec = findfirst_inode_rec(agno);
             irec != NULL;
             irec = next_ino_rec(irec)) {
                ASSERT((irec->ino_was_rl & irec->ir_free) == 0);
                ASSERT((irec->ino_is_rl & irec->ir_free) == 0);
                was = irec->ino_was_rl;
                is = irec->ino_is_rl;
                if (was == is)
                        continue;
                diff = was ^ is;
                dbg_printf("mismatch ino=%llu was=0x%lx is=0x%lx dif=0x%lx\n",
                        (unsigned long long)XFS_AGINO_TO_INO(mp, agno,
                                                irec->ino_startnum),
                        was, is, diff);

                for (bit = 0, mask = 1; bit < 64; bit++, mask <<= 1) {
                        agino = bit + irec->ino_startnum;
                        if (!(diff & mask))
                                continue;
                        else if (was & mask)
                                error = fix_inode_reflink_flag(mp, agno, agino,
                                                false);
                        else
                                error = fix_inode_reflink_flag(mp, agno, agino,
                                                true);
                        if (error)
                                do_error(
_("Unable to fix reflink flag on inode %"PRIu64".\n"),
                                        XFS_AGINO_TO_INO(mp, agno, agino));
                }
        }
}
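/*
 * ino_was_rl and ino_is_rl are 64-bit masks with one bit per inode in the
 * chunk (hence the bit < 64 loop above): "was" records the reflink flag as
 * found on disk and "is" records whether shared extents were actually
 * observed, so only the bits where the two disagree need fixing.
 */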
/*
 * Return the number of refcount objects for an AG.
 */
size_t
refcount_record_count(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        return slab_count(ag_rmaps[agno].ar_refcount_items);
}

/*
 * Return a slab cursor that will return refcount objects in order.
 */
int
init_refcount_cursor(
        xfs_agnumber_t          agno,
        struct xfs_slab_cursor  **cur)
{
        return init_slab_cursor(ag_rmaps[agno].ar_refcount_items, NULL, cur);
}

/*
 * Disable the refcount btree check.
 */
void
refcount_avoid_check(void)
{
        refcbt_suspect = true;
}
/*
 * Compare the observed reference counts against what's in the ag btree.
 */
void
check_refcounts(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno)
{
        struct xfs_slab_cursor  *rl_cur;
        struct xfs_btree_cur    *bt_cur = NULL;
        int                     error;
        int                     have;
        int                     i;
        struct xfs_buf          *agbp = NULL;
        struct xfs_refcount_irec        *rl_rec;
        struct xfs_refcount_irec        tmp;
        struct xfs_perag        *pag;           /* per allocation group data */

        if (!xfs_sb_version_hasreflink(&mp->m_sb))
                return;
        if (refcbt_suspect) {
                if (no_modify && agno == 0)
                        do_warn(_("would rebuild corrupt refcount btrees.\n"));
                return;
        }

        /* Create cursors to refcount structures */
        error = init_refcount_cursor(agno, &rl_cur);
        if (error)
                do_error(
_("Not enough memory to check refcount data.\n"));

        error = -libxfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
        if (error)
                goto err;

        /* Leave the per-ag data "uninitialized" since we rewrite it later */
        pag = libxfs_perag_get(mp, agno);
        pag->pagf_init = 0;
        libxfs_perag_put(pag);

        bt_cur = libxfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
        if (!bt_cur)
                goto err;

        rl_rec = pop_slab_cursor(rl_cur);
        while (rl_rec) {
                /* Look for a refcount record in the btree */
                error = -libxfs_refcount_lookup_le(bt_cur,
                                rl_rec->rc_startblock, &have);
                if (error)
                        goto err;
                if (!have) {
                        do_warn(
_("Missing reference count record for (%u/%u) len %u count %u\n"),
                                agno, rl_rec->rc_startblock,
                                rl_rec->rc_blockcount, rl_rec->rc_refcount);
                        goto next_loop;
                }

                error = -libxfs_refcount_get_rec(bt_cur, &tmp, &i);
                if (error)
                        goto err;
                if (!i) {
                        do_warn(
_("Missing reference count record for (%u/%u) len %u count %u\n"),
                                agno, rl_rec->rc_startblock,
                                rl_rec->rc_blockcount, rl_rec->rc_refcount);
                        goto next_loop;
                }

                /* Compare each refcount observation against the btree's */
                if (tmp.rc_startblock != rl_rec->rc_startblock ||
                    tmp.rc_blockcount < rl_rec->rc_blockcount ||
                    tmp.rc_refcount < rl_rec->rc_refcount)
                        do_warn(
_("Incorrect reference count: saw (%u/%u) len %u nlinks %u; should be (%u/%u) len %u nlinks %u\n"),
                                agno, tmp.rc_startblock, tmp.rc_blockcount,
                                tmp.rc_refcount, agno, rl_rec->rc_startblock,
                                rl_rec->rc_blockcount, rl_rec->rc_refcount);
next_loop:
                rl_rec = pop_slab_cursor(rl_cur);
        }

err:
        if (bt_cur)
                libxfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
        if (agbp)
                libxfs_putbuf(agbp);
        free_slab_cursor(&rl_cur);
}
/*
 * Regenerate the AGFL so that we don't run out of it while rebuilding the
 * rmap btree.  If skip_rmapbt is true, don't update the rmapbt (most probably
 * because we're updating the rmapbt).
 */
void
fix_freelist(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        bool                    skip_rmapbt)
{
        xfs_alloc_arg_t         args;
        struct xfs_trans        *tp;
        struct xfs_trans_res    tres = {0};
        int                     flags;
        int                     error;

        memset(&args, 0, sizeof(args));
        args.mp = mp;
        args.agno = agno;
        args.alignment = 1;
        args.pag = libxfs_perag_get(mp, agno);
        error = -libxfs_trans_alloc(mp, &tres,
                        libxfs_alloc_min_freelist(mp, args.pag), 0, 0, &tp);
        if (error)
                do_error(_("failed to fix AGFL on AG %d, error %d\n"),
                                agno, error);
        args.tp = tp;

        /*
         * Prior to rmapbt, all we had to do to fix the freelist is "expand"
         * the fresh AGFL header from empty to full.  That hasn't changed.  For
         * rmapbt, however, things change a bit.
         *
         * When we're stuffing the rmapbt with the AG btree rmaps the tree can
         * expand, so we need to keep the AGFL well-stocked for the expansion.
         * However, this expansion can cause the bnobt/cntbt to shrink, which
         * can make the AGFL eligible for shrinking.  Shrinking involves
         * freeing rmapbt entries, but since we haven't finished loading the
         * rmapbt with the btree rmaps it's possible for the remove operation
         * to fail.  The AGFL block is large enough at this point to absorb any
         * blocks freed from the bnobt/cntbt, so we can disable shrinking.
         *
         * During the initial AGFL regeneration during AGF generation in phase5
         * we must also disable rmapbt modifications because the AGF that
         * libxfs reads does not yet point to the new rmapbt.  These initial
         * AGFL entries are added just prior to adding the AG btree block rmaps
         * to the rmapbt.  It's ok to pass NOSHRINK here too, since the AGFL is
         * empty and cannot shrink.
         */
        flags = XFS_ALLOC_FLAG_NOSHRINK;
        if (skip_rmapbt)
                flags |= XFS_ALLOC_FLAG_NORMAP;
        error = -libxfs_alloc_fix_freelist(&args, flags);
        libxfs_perag_put(args.pag);
        if (error)
                do_error(_("failed to fix AGFL on AG %d, error %d\n"),
                                agno, error);
        libxfs_trans_commit(tp);
}
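/*
 * rmap_store_ag_btree_rec() above calls fix_freelist(mp, agno, false) after
 * every rmapbt insertion, while the initial fill of a freshly rebuilt AGF in
 * phase 5 passes skip_rmapbt = true, matching the two cases described in the
 * comment above.
 */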
/*
 * Remember how many AGFL entries came from excess AG btree allocations and
 * therefore already have rmap entries.
 */
void
rmap_store_agflcount(
        struct xfs_mount        *mp,
        xfs_agnumber_t          agno,
        int                     count)
{
        if (!rmap_needs_work(mp))
                return;

        ag_rmaps[agno].ar_flcount = count;
}
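/*
 * The stored ar_flcount is consumed in rmap_store_ag_btree_rec(), which
 * skips that many slots at the front of the AGFL when generating freelist
 * rmap records, since those blocks were already accounted for when the AG
 * btrees were rebuilt.
 */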