+// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2016 Oracle. All Rights Reserved.
- *
* Author: Darrick J. Wong <darrick.wong@oracle.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
-#include <libxfs.h>
+#include "libxfs.h"
#include "btree.h"
#include "err_protos.h"
#include "libxlog.h"
#include "dinode.h"
#include "slab.h"
#include "rmap.h"
+#include "libfrog/bitmap.h"
#undef RMAP_DEBUG
int ar_flcount; /* agfl entries from leftover */
/* agbt allocations */
struct xfs_rmap_irec ar_last_rmap; /* last rmap seen */
+ struct xfs_slab *ar_refcount_items; /* refcount items, p4-5 */
};
static struct xfs_ag_rmap *ag_rmaps;
-static bool rmapbt_suspect;
+bool rmapbt_suspect;
+static bool refcbt_suspect;
-/*
- * Compare rmap observations for array sorting.
- */
-static int
-rmap_compare(
- const void *a,
- const void *b)
+static inline int rmap_compare(const void *a, const void *b)
{
- const struct xfs_rmap_irec *pa;
- const struct xfs_rmap_irec *pb;
- __u64 oa;
- __u64 ob;
-
- pa = a; pb = b;
- oa = xfs_rmap_irec_offset_pack(pa);
- ob = xfs_rmap_irec_offset_pack(pb);
-
- if (pa->rm_startblock < pb->rm_startblock)
- return -1;
- else if (pa->rm_startblock > pb->rm_startblock)
- return 1;
- else if (pa->rm_owner < pb->rm_owner)
- return -1;
- else if (pa->rm_owner > pb->rm_owner)
- return 1;
- else if (oa < ob)
- return -1;
- else if (oa > ob)
- return 1;
- else
- return 0;
+ return libxfs_rmap_compare(a, b);
}
/*
* mapping trees.
*/
bool
-needs_rmap_work(
+rmap_needs_work(
struct xfs_mount *mp)
{
- return xfs_sb_version_hasrmapbt(&mp->m_sb);
+ return xfs_has_reflink(mp) ||
+ xfs_has_rmapbt(mp);
}
/*
* Initialize per-AG reverse map data.
*/
void
-init_rmaps(
+rmaps_init(
struct xfs_mount *mp)
{
xfs_agnumber_t i;
int error;
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return;
ag_rmaps = calloc(mp->m_sb.sb_agcount, sizeof(struct xfs_ag_rmap));
do_error(
_("Insufficient memory while allocating raw metadata reverse mapping slabs."));
ag_rmaps[i].ar_last_rmap.rm_owner = XFS_RMAP_OWN_UNKNOWN;
+ error = init_slab(&ag_rmaps[i].ar_refcount_items,
+ sizeof(struct xfs_refcount_irec));
+ if (error)
+ do_error(
+_("Insufficient memory while allocating refcount item slabs."));
}
}
* Free the per-AG reverse-mapping data.
*/
void
-free_rmaps(
+rmaps_free(
struct xfs_mount *mp)
{
xfs_agnumber_t i;
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return;
for (i = 0; i < mp->m_sb.sb_agcount; i++) {
free_slab(&ag_rmaps[i].ar_rmaps);
free_slab(&ag_rmaps[i].ar_raw_rmaps);
+ free_slab(&ag_rmaps[i].ar_refcount_items);
}
free(ag_rmaps);
ag_rmaps = NULL;
* Decide if two reverse-mapping records can be merged.
*/
bool
-mergeable_rmaps(
+rmaps_are_mergeable(
struct xfs_rmap_irec *r1,
struct xfs_rmap_irec *r2)
{
* fork for later btree reconstruction.
*/
int
-add_rmap(
+rmap_add_rec(
struct xfs_mount *mp,
xfs_ino_t ino,
int whichfork,
struct xfs_rmap_irec *last_rmap;
int error = 0;
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return 0;
agno = XFS_FSB_TO_AGNO(mp, irec->br_startblock);
last_rmap = &ag_rmaps[agno].ar_last_rmap;
if (last_rmap->rm_owner == XFS_RMAP_OWN_UNKNOWN)
*last_rmap = rmap;
- else if (mergeable_rmaps(last_rmap, &rmap))
+ else if (rmaps_are_mergeable(last_rmap, &rmap))
last_rmap->rm_blockcount += rmap.rm_blockcount;
else {
error = slab_add(ag_rmaps[agno].ar_rmaps, last_rmap);
/* Finish collecting inode data/attr fork rmaps. */
int
-finish_collecting_fork_rmaps(
+rmap_finish_collecting_fork_recs(
struct xfs_mount *mp,
xfs_agnumber_t agno)
{
- if (!needs_rmap_work(mp) ||
+ if (!rmap_needs_work(mp) ||
ag_rmaps[agno].ar_last_rmap.rm_owner == XFS_RMAP_OWN_UNKNOWN)
return 0;
return slab_add(ag_rmaps[agno].ar_rmaps, &ag_rmaps[agno].ar_last_rmap);
/* add a raw rmap; these will be merged later */
static int
-__add_raw_rmap(
+__rmap_add_raw_rec(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
* Add a reverse mapping for an inode fork's block mapping btree block.
*/
int
-add_bmbt_rmap(
+rmap_add_bmbt_rec(
struct xfs_mount *mp,
xfs_ino_t ino,
int whichfork,
xfs_agnumber_t agno;
xfs_agblock_t agbno;
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return 0;
agno = XFS_FSB_TO_AGNO(mp, fsbno);
ASSERT(agno < mp->m_sb.sb_agcount);
ASSERT(agbno + 1 <= mp->m_sb.sb_agblocks);
- return __add_raw_rmap(mp, agno, agbno, 1, ino,
+ return __rmap_add_raw_rec(mp, agno, agbno, 1, ino,
whichfork == XFS_ATTR_FORK, true);
}
* Add a reverse mapping for a per-AG fixed metadata extent.
*/
int
-add_ag_rmap(
+rmap_add_ag_rec(
struct xfs_mount *mp,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
xfs_extlen_t len,
uint64_t owner)
{
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return 0;
ASSERT(agno != NULLAGNUMBER);
ASSERT(agno < mp->m_sb.sb_agcount);
ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
- return __add_raw_rmap(mp, agno, agbno, len, owner, false, false);
+ return __rmap_add_raw_rec(mp, agno, agbno, len, owner, false, false);
}
/*
* Merge adjacent raw rmaps and add them to the main rmap list.
*/
int
-fold_raw_rmaps(
+rmap_fold_raw_recs(
struct xfs_mount *mp,
xfs_agnumber_t agno)
{
struct xfs_slab_cursor *cur = NULL;
struct xfs_rmap_irec *prev, *rec;
- size_t old_sz;
- int error;
+ uint64_t old_sz;
+ int error = 0;
old_sz = slab_count(ag_rmaps[agno].ar_rmaps);
if (slab_count(ag_rmaps[agno].ar_raw_rmaps) == 0)
prev = pop_slab_cursor(cur);
rec = pop_slab_cursor(cur);
- while (rec) {
- if (mergeable_rmaps(prev, rec)) {
+ while (prev && rec) {
+ if (rmaps_are_mergeable(prev, rec)) {
prev->rm_blockcount += rec->rm_blockcount;
rec = pop_slab_cursor(cur);
continue;
static int
find_first_zero_bit(
- __uint64_t mask)
+ uint64_t mask)
{
int n;
int b = 0;
static int
popcnt(
- __uint64_t mask)
+ uint64_t mask)
{
int n;
int b = 0;
* sb/agi/agf/agfl headers, inode chunks, and the log.
*/
int
-add_fixed_ag_rmap_data(
+rmap_add_fixed_ag_rec(
struct xfs_mount *mp,
xfs_agnumber_t agno)
{
int startidx;
int nr;
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return 0;
/* sb/agi/agf/agfl headers */
- error = add_ag_rmap(mp, agno, 0, XFS_BNO_BLOCK(mp),
+ error = rmap_add_ag_rec(mp, agno, 0, XFS_BNO_BLOCK(mp),
XFS_RMAP_OWN_FS);
if (error)
goto out;
/* inodes */
ino_rec = findfirst_inode_rec(agno);
for (; ino_rec != NULL; ino_rec = next_ino_rec(ino_rec)) {
- if (xfs_sb_version_hassparseinodes(&mp->m_sb)) {
+ if (xfs_has_sparseinodes(mp)) {
startidx = find_first_zero_bit(ino_rec->ir_sparse);
nr = XFS_INODES_PER_CHUNK - popcnt(ino_rec->ir_sparse);
} else {
agino = ino_rec->ino_startnum + startidx;
agbno = XFS_AGINO_TO_AGBNO(mp, agino);
if (XFS_AGINO_TO_OFFSET(mp, agino) == 0) {
- error = add_ag_rmap(mp, agno, agbno, nr,
+ error = rmap_add_ag_rec(mp, agno, agbno, nr,
XFS_RMAP_OWN_INODES);
if (error)
goto out;
fsbno = mp->m_sb.sb_logstart;
if (fsbno && XFS_FSB_TO_AGNO(mp, fsbno) == agno) {
agbno = XFS_FSB_TO_AGBNO(mp, mp->m_sb.sb_logstart);
- error = add_ag_rmap(mp, agno, agbno, mp->m_sb.sb_logblocks,
+ error = rmap_add_ag_rec(mp, agno, agbno, mp->m_sb.sb_logblocks,
XFS_RMAP_OWN_LOG);
if (error)
goto out;
* the rmapbt, after which it is fully regenerated.
*/
int
-store_ag_btree_rmap_data(
+rmap_store_ag_btree_rec(
struct xfs_mount *mp,
xfs_agnumber_t agno)
{
struct xfs_buf *agbp = NULL;
struct xfs_buf *agflbp = NULL;
struct xfs_trans *tp;
- struct xfs_trans_res tres = {0};
__be32 *agfl_bno, *b;
+ struct xfs_ag_rmap *ag_rmap = &ag_rmaps[agno];
+ struct bitmap *own_ag_bitmap = NULL;
int error = 0;
- struct xfs_owner_info oinfo;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (!xfs_has_rmapbt(mp))
return 0;
/* Release the ar_rmaps; they were put into the rmapbt during p5. */
- free_slab(&ag_rmaps[agno].ar_rmaps);
- error = init_slab(&ag_rmaps[agno].ar_rmaps,
- sizeof(struct xfs_rmap_irec));
+ free_slab(&ag_rmap->ar_rmaps);
+ error = init_slab(&ag_rmap->ar_rmaps, sizeof(struct xfs_rmap_irec));
if (error)
goto err;
/* Add the AGFL blocks to the rmap list */
- error = libxfs_trans_read_buf(
+ error = -libxfs_trans_read_buf(
mp, NULL, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, &agflbp, &xfs_agfl_buf_ops);
if (error)
goto err;
- agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
- agfl_bno += ag_rmaps[agno].ar_flcount;
- b = agfl_bno;
- while (*b != NULLAGBLOCK && b - agfl_bno <= XFS_AGFL_SIZE(mp)) {
- error = add_ag_rmap(mp, agno, be32_to_cpu(*b), 1,
- XFS_RMAP_OWN_AG);
- if (error)
- goto err;
+ /*
+ * Sometimes, the blocks at the beginning of the AGFL are there
+ * because we overestimated how many blocks we needed to rebuild
+ * the freespace btrees. ar_flcount records the number of
+ * blocks in this situation. Since those blocks already have an
+ * rmap, we only need to add rmap records for AGFL blocks past
+ * that point in the AGFL because those blocks are a result of a
+ * no-rmap no-shrink freelist fixup that we did earlier.
+ *
+ * However, some blocks end up on the AGFL because the free space
+ * btrees shed blocks as a result of allocating space to fix the
+ * freelist. We already created in-core rmap records for the free
+ * space btree blocks, so we must be careful not to create those
+ * records again. Create a bitmap of already-recorded OWN_AG rmaps.
+ */
+ error = init_slab_cursor(ag_rmap->ar_raw_rmaps, rmap_compare, &rm_cur);
+ if (error)
+ goto err;
+ error = -bitmap_alloc(&own_ag_bitmap);
+ if (error)
+ goto err_slab;
+ while ((rm_rec = pop_slab_cursor(rm_cur)) != NULL) {
+ if (rm_rec->rm_owner != XFS_RMAP_OWN_AG)
+ continue;
+ error = -bitmap_set(own_ag_bitmap, rm_rec->rm_startblock,
+ rm_rec->rm_blockcount);
+ if (error) {
+ /*
+ * If this range is already set, then the incore rmap
+ * records for the AG free space btrees overlap and
+ * we're toast because that is not allowed.
+ */
+ if (error == EEXIST)
+ error = EFSCORRUPTED;
+ goto err_slab;
+ }
+ }
+ free_slab_cursor(&rm_cur);
+
+ /* Create rmaps for any AGFL blocks that aren't already rmapped. */
+ agfl_bno = xfs_buf_to_agfl_bno(agflbp);
+ b = agfl_bno + ag_rmap->ar_flcount;
+ while (*b != cpu_to_be32(NULLAGBLOCK) &&
+ b - agfl_bno < libxfs_agfl_size(mp)) {
+ xfs_agblock_t agbno;
+
+ agbno = be32_to_cpu(*b);
+ if (!bitmap_test(own_ag_bitmap, agbno, 1)) {
+ error = rmap_add_ag_rec(mp, agno, agbno, 1,
+ XFS_RMAP_OWN_AG);
+ if (error)
+ goto err;
+ }
b++;
}
- libxfs_putbuf(agflbp);
+ libxfs_buf_relse(agflbp);
agflbp = NULL;
+ bitmap_free(&own_ag_bitmap);
/* Merge all the raw rmaps into the main list */
- error = fold_raw_rmaps(mp, agno);
+ error = rmap_fold_raw_recs(mp, agno);
if (error)
goto err;
/* Create cursors to refcount structures */
- error = init_slab_cursor(ag_rmaps[agno].ar_rmaps, rmap_compare,
- &rm_cur);
+ error = init_slab_cursor(ag_rmap->ar_rmaps, rmap_compare, &rm_cur);
if (error)
goto err;
/* Insert rmaps into the btree one at a time */
rm_rec = pop_slab_cursor(rm_cur);
while (rm_rec) {
- error = -libxfs_trans_alloc(mp, &tres, 16, 0, 0, &tp);
+ struct xfs_owner_info oinfo = {};
+ struct xfs_perag *pag;
+
+ error = -libxfs_trans_alloc_rollable(mp, 16, &tp);
if (error)
goto err_slab;
- error = libxfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
- if (error)
+ pag = libxfs_perag_get(mp, agno);
+ error = -libxfs_alloc_read_agf(pag, tp, 0, &agbp);
+ if (error) {
+ libxfs_perag_put(pag);
goto err_trans;
+ }
ASSERT(XFS_RMAP_NON_INODE_OWNER(rm_rec->rm_owner));
- libxfs_rmap_ag_owner(&oinfo, rm_rec->rm_owner);
- error = libxfs_rmap_alloc(tp, agbp, agno, rm_rec->rm_startblock,
+ oinfo.oi_owner = rm_rec->rm_owner;
+ error = -libxfs_rmap_alloc(tp, agbp, pag, rm_rec->rm_startblock,
rm_rec->rm_blockcount, &oinfo);
+ libxfs_perag_put(pag);
if (error)
goto err_trans;
free_slab_cursor(&rm_cur);
err:
if (agflbp)
- libxfs_putbuf(agflbp);
- printf("FAIL err %d\n", error);
+ libxfs_buf_relse(agflbp);
+ if (own_ag_bitmap)
+ bitmap_free(&own_ag_bitmap);
return error;
}
#ifdef RMAP_DEBUG
static void
-dump_rmap(
+rmap_dump(
const char *msg,
xfs_agnumber_t agno,
struct xfs_rmap_irec *rmap)
(unsigned int)rmap->rm_flags);
}
#else
-# define dump_rmap(m, a, r)
+# define rmap_dump(m, a, r)
#endif
+/*
+ * Rebuilding the Reference Count & Reverse Mapping Btrees
+ *
+ * The reference count (refcnt) and reverse mapping (rmap) btrees are
+ * rebuilt during phase 5, like all other AG btrees. Therefore, reverse
+ * mappings must be processed into reference counts at the end of phase
+ * 4, and the rmaps must be recorded during phase 4. There is a need to
+ * access the rmaps in physical block order, but no particular need for
+ * random access, so the slab.c code provides a big logical array
+ * (consisting of smaller slabs) and some inorder iterator functions.
+ *
+ * Once we've recorded all the reverse mappings, we're ready to
+ * translate the rmaps into refcount entries. Imagine the rmap entries
+ * as rectangles representing extents of physical blocks, and that the
+ * rectangles can be laid down to allow them to overlap each other; then
+ * we know that we must emit a refcnt btree entry wherever the amount of
+ * overlap changes, i.e. the emission stimulus is level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * An implementation detail is that because this processing happens
+ * during phase 4, the refcount entries are stored in an array so that
+ * phase 5 can load them into the refcount btree. The rmaps can be
+ * loaded directly into the rmap btree during phase 5 as well.
+ */
+
+/*
+ * Mark all inodes in the reverse-mapping observation stack as requiring the
+ * reflink inode flag, if the stack depth is greater than 1.
+ */
+static void
+mark_inode_rl(
+ struct xfs_mount *mp,
+ struct xfs_bag *rmaps)
+{
+ xfs_agnumber_t iagno;
+ struct xfs_rmap_irec *rmap;
+ struct ino_tree_node *irec;
+ int off;
+ uint64_t idx;
+ xfs_agino_t ino;
+
+ if (bag_count(rmaps) < 2)
+ return;
+
+ /* Reflink flag accounting */
+ foreach_bag_ptr(rmaps, idx, rmap) {
+ ASSERT(!XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner));
+ iagno = XFS_INO_TO_AGNO(mp, rmap->rm_owner);
+ ino = XFS_INO_TO_AGINO(mp, rmap->rm_owner);
+ pthread_mutex_lock(&ag_locks[iagno].lock);
+ irec = find_inode_rec(mp, iagno, ino);
+ off = get_inode_offset(mp, rmap->rm_owner, irec);
+ /* lock here because we might go outside this ag */
+ set_inode_is_rl(irec, off);
+ pthread_mutex_unlock(&ag_locks[iagno].lock);
+ }
+}
+
+/*
+ * Emit a refcount object for refcntbt reconstruction during phase 5.
+ */
+static void
+refcount_emit(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t nr_rmaps)
+{
+ struct xfs_refcount_irec rlrec;
+ int error;
+ struct xfs_slab *rlslab;
+
+ rlslab = ag_rmaps[agno].ar_refcount_items;
+ ASSERT(nr_rmaps > 0);
+
+ dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n",
+ agno, agbno, len, nr_rmaps);
+ rlrec.rc_startblock = agbno;
+ rlrec.rc_blockcount = len;
+ nr_rmaps = min(nr_rmaps, MAXREFCOUNT);
+ rlrec.rc_refcount = nr_rmaps;
+ rlrec.rc_domain = XFS_REFC_DOMAIN_SHARED;
+
+ error = slab_add(rlslab, &rlrec);
+ if (error)
+ do_error(
+_("Insufficient memory while recreating refcount tree."));
+}
+
+/*
+ * Transform a pile of physical block mapping observations into refcount data
+ * for eventual rebuilding of the btrees.
+ */
+#define RMAP_END(r) ((r)->rm_startblock + (r)->rm_blockcount)
+int
+compute_refcounts(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_bag *stack_top = NULL;
+ struct xfs_slab *rmaps;
+ struct xfs_slab_cursor *rmaps_cur;
+ struct xfs_rmap_irec *array_cur;
+ struct xfs_rmap_irec *rmap;
+ uint64_t n, idx;
+ uint64_t old_stack_nr;
+ xfs_agblock_t sbno; /* first bno of this rmap set */
+ xfs_agblock_t cbno; /* first bno of this refcount set */
+ xfs_agblock_t nbno; /* next bno where rmap set changes */
+ int error;
+
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ rmaps = ag_rmaps[agno].ar_rmaps;
+
+ error = init_slab_cursor(rmaps, rmap_compare, &rmaps_cur);
+ if (error)
+ return error;
+
+ error = init_bag(&stack_top);
+ if (error)
+ goto err;
+
+ /* While there are rmaps to be processed... */
+ n = 0;
+ while (n < slab_count(rmaps)) {
+ array_cur = peek_slab_cursor(rmaps_cur);
+ sbno = cbno = array_cur->rm_startblock;
+ /* Push all rmaps with pblk == sbno onto the stack */
+ for (;
+ array_cur && array_cur->rm_startblock == sbno;
+ array_cur = peek_slab_cursor(rmaps_cur)) {
+ advance_slab_cursor(rmaps_cur); n++;
+ rmap_dump("push0", agno, array_cur);
+ error = bag_add(stack_top, array_cur);
+ if (error)
+ goto err;
+ }
+ mark_inode_rl(mp, stack_top);
+
+ /* Set nbno to the bno of the next refcount change */
+ if (n < slab_count(rmaps) && array_cur)
+ nbno = array_cur->rm_startblock;
+ else
+ nbno = NULLAGBLOCK;
+ foreach_bag_ptr(stack_top, idx, rmap) {
+ nbno = min(nbno, RMAP_END(rmap));
+ }
+
+ /* Emit reverse mappings, if needed */
+ ASSERT(nbno > sbno);
+ old_stack_nr = bag_count(stack_top);
+
+ /* While stack isn't empty... */
+ while (bag_count(stack_top)) {
+ /* Pop all rmaps that end at nbno */
+ foreach_bag_ptr_reverse(stack_top, idx, rmap) {
+ if (RMAP_END(rmap) != nbno)
+ continue;
+ rmap_dump("pop", agno, rmap);
+ error = bag_remove(stack_top, idx);
+ if (error)
+ goto err;
+ }
+
+ /* Push array items that start at nbno */
+ for (;
+ array_cur && array_cur->rm_startblock == nbno;
+ array_cur = peek_slab_cursor(rmaps_cur)) {
+ advance_slab_cursor(rmaps_cur); n++;
+ rmap_dump("push1", agno, array_cur);
+ error = bag_add(stack_top, array_cur);
+ if (error)
+ goto err;
+ }
+ mark_inode_rl(mp, stack_top);
+
+ /* Emit refcount if necessary */
+ ASSERT(nbno > cbno);
+ if (bag_count(stack_top) != old_stack_nr) {
+ if (old_stack_nr > 1) {
+ refcount_emit(mp, agno, cbno,
+ nbno - cbno,
+ old_stack_nr);
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (bag_count(stack_top) == 0)
+ break;
+ old_stack_nr = bag_count(stack_top);
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ if (n < slab_count(rmaps))
+ nbno = array_cur->rm_startblock;
+ else
+ nbno = NULLAGBLOCK;
+ foreach_bag_ptr(stack_top, idx, rmap) {
+ nbno = min(nbno, RMAP_END(rmap));
+ }
+
+ /* Emit reverse mappings, if needed */
+ ASSERT(nbno > sbno);
+ }
+ }
+err:
+ free_bag(&stack_top);
+ free_slab_cursor(&rmaps_cur);
+
+ return error;
+}
+#undef RMAP_END
+
/*
* Return the number of rmap objects for an AG.
*/
-size_t
+uint64_t
rmap_record_count(
- struct xfs_mount *mp,
+ struct xfs_mount *mp,
xfs_agnumber_t agno)
{
return slab_count(ag_rmaps[agno].ar_rmaps);
* Return a slab cursor that will return rmap objects in order.
*/
int
-init_rmap_cursor(
+rmap_init_cursor(
xfs_agnumber_t agno,
struct xfs_slab_cursor **cur)
{
/* Look for an rmap in the rmapbt that matches a given rmap. */
static int
-lookup_rmap(
+rmap_lookup(
struct xfs_btree_cur *bt_cur,
struct xfs_rmap_irec *rm_rec,
struct xfs_rmap_irec *tmp,
int *have)
{
- int error;
-
/* Use the regular btree retrieval routine. */
- error = -libxfs_rmap_lookup_le(bt_cur, rm_rec->rm_startblock,
- rm_rec->rm_blockcount,
+ return -libxfs_rmap_lookup_le(bt_cur, rm_rec->rm_startblock,
rm_rec->rm_owner, rm_rec->rm_offset,
- rm_rec->rm_flags, have);
- if (error)
- return error;
- if (*have == 0)
- return error;
- return -libxfs_rmap_get_rec(bt_cur, tmp, have);
+ rm_rec->rm_flags, tmp, have);
+}
+
+/* Look for an rmap in the rmapbt that matches a given rmap. */
+static int
+rmap_lookup_overlapped(
+ struct xfs_btree_cur *bt_cur,
+ struct xfs_rmap_irec *rm_rec,
+ struct xfs_rmap_irec *tmp,
+ int *have)
+{
+ /* Have to use our fancy version for overlapped */
+ return -libxfs_rmap_lookup_le_range(bt_cur, rm_rec->rm_startblock,
+ rm_rec->rm_owner, rm_rec->rm_offset,
+ rm_rec->rm_flags, tmp, have);
}
/* Does the btree rmap cover the observed rmap? */
#define NEXTP(x) ((x)->rm_startblock + (x)->rm_blockcount)
#define NEXTL(x) ((x)->rm_offset + (x)->rm_blockcount)
static bool
-is_good_rmap(
+rmap_is_good(
struct xfs_rmap_irec *observed,
struct xfs_rmap_irec *btree)
{
/*
* Compare the observed reverse mappings against what's in the ag btree.
*/
-int
-check_rmaps(
+void
+rmaps_verify_btree(
struct xfs_mount *mp,
xfs_agnumber_t agno)
{
+ struct xfs_rmap_irec tmp;
struct xfs_slab_cursor *rm_cur;
struct xfs_btree_cur *bt_cur = NULL;
- int error;
- int have;
struct xfs_buf *agbp = NULL;
struct xfs_rmap_irec *rm_rec;
- struct xfs_rmap_irec tmp;
- struct xfs_perag *pag; /* per allocation group data */
+ struct xfs_perag *pag = NULL;
+ int have;
+ int error;
- if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
- return 0;
+ if (!xfs_has_rmapbt(mp))
+ return;
if (rmapbt_suspect) {
if (no_modify && agno == 0)
do_warn(_("would rebuild corrupt rmap btrees.\n"));
- return 0;
+ return;
}
/* Create cursors to refcount structures */
- error = init_rmap_cursor(agno, &rm_cur);
- if (error)
- return error;
+ error = rmap_init_cursor(agno, &rm_cur);
+ if (error) {
+ do_warn(_("Not enough memory to check reverse mappings.\n"));
+ return;
+ }
- error = -libxfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
- if (error)
- goto err;
+ pag = libxfs_perag_get(mp, agno);
+ error = -libxfs_alloc_read_agf(pag, NULL, 0, &agbp);
+ if (error) {
+ do_warn(_("Could not read AGF %u to check rmap btree.\n"),
+ agno);
+ goto err_pag;
+ }
/* Leave the per-ag data "uninitialized" since we rewrite it later */
- pag = xfs_perag_get(mp, agno);
- pag->pagf_init = 0;
- xfs_perag_put(pag);
+ clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
- bt_cur = libxfs_rmapbt_init_cursor(mp, NULL, agbp, agno);
+ bt_cur = libxfs_rmapbt_init_cursor(mp, NULL, agbp, pag);
if (!bt_cur) {
- error = -ENOMEM;
- goto err;
+ do_warn(_("Not enough memory to check reverse mappings.\n"));
+ goto err_agf;
}
rm_rec = pop_slab_cursor(rm_cur);
while (rm_rec) {
- error = lookup_rmap(bt_cur, rm_rec, &tmp, &have);
- if (error)
- goto err;
+ error = rmap_lookup(bt_cur, rm_rec, &tmp, &have);
+ if (error) {
+ do_warn(
+_("Could not read reverse-mapping record for (%u/%u).\n"),
+ agno, rm_rec->rm_startblock);
+ goto err_cur;
+ }
+
+ /*
+ * Using the range query is expensive, so only do it if
+ * the regular lookup doesn't find anything or if it doesn't
+ * match the observed rmap.
+ */
+ if (xfs_has_reflink(bt_cur->bc_mp) &&
+ (!have || !rmap_is_good(rm_rec, &tmp))) {
+ error = rmap_lookup_overlapped(bt_cur, rm_rec,
+ &tmp, &have);
+ if (error) {
+ do_warn(
+_("Could not read reverse-mapping record for (%u/%u).\n"),
+ agno, rm_rec->rm_startblock);
+ goto err_cur;
+ }
+ }
if (!have) {
do_warn(
_("Missing reverse-mapping record for (%u/%u) %slen %u owner %"PRId64" \
}
/* Compare each refcount observation against the btree's */
- if (!is_good_rmap(rm_rec, &tmp)) {
+ if (!rmap_is_good(rm_rec, &tmp)) {
do_warn(
_("Incorrect reverse-mapping: saw (%u/%u) %slen %u owner %"PRId64" %s%soff \
%"PRIu64"; should be (%u/%u) %slen %u owner %"PRId64" %s%soff %"PRIu64"\n"),
rm_rec = pop_slab_cursor(rm_cur);
}
-err:
- if (bt_cur)
- libxfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
- if (agbp)
- libxfs_putbuf(agbp);
+err_cur:
+ libxfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+err_agf:
+ libxfs_buf_relse(agbp);
+err_pag:
+ libxfs_perag_put(pag);
free_slab_cursor(&rm_cur);
- return 0;
}
/*
* Compare the key fields of two rmap records -- positive if key1 > key2,
* negative if key1 < key2, and zero if equal.
*/
-__int64_t
+int64_t
rmap_diffkeys(
struct xfs_rmap_irec *kp1,
struct xfs_rmap_irec *kp2)
{
__u64 oa;
__u64 ob;
- __int64_t d;
+ int64_t d;
struct xfs_rmap_irec tmp;
tmp = *kp1;
tmp.rm_flags &= ~XFS_RMAP_REC_FLAGS;
- oa = xfs_rmap_irec_offset_pack(&tmp);
+ oa = libxfs_rmap_irec_offset_pack(&tmp);
tmp = *kp2;
tmp.rm_flags &= ~XFS_RMAP_REC_FLAGS;
- ob = xfs_rmap_irec_offset_pack(&tmp);
+ ob = libxfs_rmap_irec_offset_pack(&tmp);
- d = (__int64_t)kp1->rm_startblock - kp2->rm_startblock;
+ d = (int64_t)kp1->rm_startblock - kp2->rm_startblock;
if (d)
return d;
key->rm_offset += adj;
}
+/*
+ * Record that an inode had the reflink flag set when repair started. The
+ * inode reflink flag will be adjusted as necessary.
+ */
+void
+record_inode_reflink_flag(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dino,
+ xfs_agnumber_t agno,
+ xfs_agino_t ino,
+ xfs_ino_t lino)
+{
+ struct ino_tree_node *irec;
+ int off;
+
+ ASSERT(XFS_AGINO_TO_INO(mp, agno, ino) == be64_to_cpu(dino->di_ino));
+ if (!(be64_to_cpu(dino->di_flags2) & XFS_DIFLAG2_REFLINK))
+ return;
+ irec = find_inode_rec(mp, agno, ino);
+ off = get_inode_offset(mp, lino, irec);
+ ASSERT(!inode_was_rl(irec, off));
+ set_inode_was_rl(irec, off);
+ dbg_printf("set was_rl lino=%llu was=0x%llx\n",
+ (unsigned long long)lino, (unsigned long long)irec->ino_was_rl);
+}
+
+/*
+ * Inform the user that we're clearing the reflink flag on an inode that
+ * doesn't actually share any blocks. This is an optimization (the kernel
+ * skips refcount checks for non-reflink files) and not a corruption repair,
+ * so we don't need to log every time we clear a flag unless verbose mode is
+ * enabled.
+ */
+static void
+warn_clearing_reflink(
+ xfs_ino_t ino)
+{
+ static bool warned = false;
+ static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+ if (verbose) {
+ do_warn(_("clearing reflink flag on inode %"PRIu64"\n"), ino);
+ return;
+ }
+
+ if (warned)
+ return;
+
+ pthread_mutex_lock(&lock);
+ if (!warned) {
+ do_warn(_("clearing reflink flag on inodes when possible\n"));
+ warned = true;
+ }
+ pthread_mutex_unlock(&lock);
+}
+
+/*
+ * Fix an inode's reflink flag.
+ */
+static int
+fix_inode_reflink_flag(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ bool set)
+{
+ struct xfs_dinode *dino;
+ struct xfs_buf *buf;
+
+ if (set)
+ do_warn(
+_("setting reflink flag on inode %"PRIu64"\n"),
+ XFS_AGINO_TO_INO(mp, agno, agino));
+ else if (!no_modify) /* && !set */
+ warn_clearing_reflink(XFS_AGINO_TO_INO(mp, agno, agino));
+ if (no_modify)
+ return 0;
+
+ buf = get_agino_buf(mp, agno, agino, &dino);
+ if (!buf)
+ return 1;
+ ASSERT(XFS_AGINO_TO_INO(mp, agno, agino) == be64_to_cpu(dino->di_ino));
+ if (set)
+ dino->di_flags2 |= cpu_to_be64(XFS_DIFLAG2_REFLINK);
+ else
+ dino->di_flags2 &= cpu_to_be64(~XFS_DIFLAG2_REFLINK);
+ libxfs_dinode_calc_crc(mp, dino);
+ libxfs_buf_mark_dirty(buf);
+ libxfs_buf_relse(buf);
+
+ return 0;
+}
+
+/*
+ * Fix discrepancies between the state of the inode reflink flag and our
+ * observations as to whether or not the inode really needs it.
+ */
+int
+fix_inode_reflink_flags(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct ino_tree_node *irec;
+ int bit;
+ uint64_t was;
+ uint64_t is;
+ uint64_t diff;
+ uint64_t mask;
+ int error = 0;
+ xfs_agino_t agino;
+
+ /*
+ * Update the reflink flag for any inode where there's a discrepancy
+ * between the inode flag and whether or not we found any reflinked
+ * extents.
+ */
+ for (irec = findfirst_inode_rec(agno);
+ irec != NULL;
+ irec = next_ino_rec(irec)) {
+ ASSERT((irec->ino_was_rl & irec->ir_free) == 0);
+ ASSERT((irec->ino_is_rl & irec->ir_free) == 0);
+ was = irec->ino_was_rl;
+ is = irec->ino_is_rl;
+ if (was == is)
+ continue;
+ diff = was ^ is;
+ dbg_printf("mismatch ino=%llu was=0x%lx is=0x%lx dif=0x%lx\n",
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno,
+ irec->ino_startnum),
+ was, is, diff);
+
+ for (bit = 0, mask = 1; bit < 64; bit++, mask <<= 1) {
+ agino = bit + irec->ino_startnum;
+ if (!(diff & mask))
+ continue;
+ else if (was & mask)
+ error = fix_inode_reflink_flag(mp, agno, agino,
+ false);
+ else if (is & mask)
+ error = fix_inode_reflink_flag(mp, agno, agino,
+ true);
+ else
+ ASSERT(0);
+ if (error)
+ do_error(
+_("Unable to fix reflink flag on inode %"PRIu64".\n"),
+ XFS_AGINO_TO_INO(mp, agno, agino));
+ }
+ }
+
+ return error;
+}
+
+/*
+ * Return the number of refcount objects for an AG.
+ */
+uint64_t
+refcount_record_count(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ return slab_count(ag_rmaps[agno].ar_refcount_items);
+}
+
+/*
+ * Return a slab cursor that will return refcount objects in order.
+ */
+int
+init_refcount_cursor(
+ xfs_agnumber_t agno,
+ struct xfs_slab_cursor **cur)
+{
+ return init_slab_cursor(ag_rmaps[agno].ar_refcount_items, NULL, cur);
+}
+
+/*
+ * Disable the refcount btree check.
+ */
+void
+refcount_avoid_check(void)
+{
+ refcbt_suspect = true;
+}
+
+/*
+ * Compare the observed reference counts against what's in the ag btree.
+ */
+void
+check_refcounts(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno)
+{
+ struct xfs_refcount_irec tmp;
+ struct xfs_slab_cursor *rl_cur;
+ struct xfs_btree_cur *bt_cur = NULL;
+ struct xfs_buf *agbp = NULL;
+ struct xfs_perag *pag = NULL;
+ struct xfs_refcount_irec *rl_rec;
+ int have;
+ int i;
+ int error;
+
+ if (!xfs_has_reflink(mp))
+ return;
+ if (refcbt_suspect) {
+ if (no_modify && agno == 0)
+ do_warn(_("would rebuild corrupt refcount btrees.\n"));
+ return;
+ }
+
+ /* Create cursors to refcount structures */
+ error = init_refcount_cursor(agno, &rl_cur);
+ if (error) {
+ do_warn(_("Not enough memory to check refcount data.\n"));
+ return;
+ }
+
+ pag = libxfs_perag_get(mp, agno);
+ error = -libxfs_alloc_read_agf(pag, NULL, 0, &agbp);
+ if (error) {
+ do_warn(_("Could not read AGF %u to check refcount btree.\n"),
+ agno);
+ goto err_pag;
+ }
+
+ /* Leave the per-ag data "uninitialized" since we rewrite it later */
+ clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
+
+ bt_cur = libxfs_refcountbt_init_cursor(mp, NULL, agbp, pag);
+ if (!bt_cur) {
+ do_warn(_("Not enough memory to check refcount data.\n"));
+ goto err_agf;
+ }
+
+ rl_rec = pop_slab_cursor(rl_cur);
+ while (rl_rec) {
+ /* Look for a refcount record in the btree */
+ error = -libxfs_refcount_lookup_le(bt_cur,
+ XFS_REFC_DOMAIN_SHARED, rl_rec->rc_startblock,
+ &have);
+ if (error) {
+ do_warn(
+_("Could not read reference count record for (%u/%u).\n"),
+ agno, rl_rec->rc_startblock);
+ goto err_cur;
+ }
+ if (!have) {
+ do_warn(
+_("Missing reference count record for (%u/%u) len %u count %u\n"),
+ agno, rl_rec->rc_startblock,
+ rl_rec->rc_blockcount, rl_rec->rc_refcount);
+ goto next_loop;
+ }
+
+ error = -libxfs_refcount_get_rec(bt_cur, &tmp, &i);
+ if (error) {
+ do_warn(
+_("Could not read reference count record for (%u/%u).\n"),
+ agno, rl_rec->rc_startblock);
+ goto err_cur;
+ }
+ if (!i) {
+ do_warn(
+_("Missing reference count record for (%u/%u) len %u count %u\n"),
+ agno, rl_rec->rc_startblock,
+ rl_rec->rc_blockcount, rl_rec->rc_refcount);
+ goto next_loop;
+ }
+
+ /* Compare each refcount observation against the btree's */
+ if (tmp.rc_domain != rl_rec->rc_domain ||
+ tmp.rc_startblock != rl_rec->rc_startblock ||
+ tmp.rc_blockcount != rl_rec->rc_blockcount ||
+ tmp.rc_refcount != rl_rec->rc_refcount) {
+ unsigned int start;
+
+ start = xfs_refcount_encode_startblock(
+ tmp.rc_startblock, tmp.rc_domain);
+
+ do_warn(
+_("Incorrect reference count: saw (%u/%u) len %u nlinks %u; should be (%u/%u) len %u nlinks %u\n"),
+ agno, start, tmp.rc_blockcount,
+ tmp.rc_refcount, agno, rl_rec->rc_startblock,
+ rl_rec->rc_blockcount, rl_rec->rc_refcount);
+ }
+next_loop:
+ rl_rec = pop_slab_cursor(rl_cur);
+ }
+
+err_cur:
+ libxfs_btree_del_cursor(bt_cur, error);
+err_agf:
+ libxfs_buf_relse(agbp);
+err_pag:
+ libxfs_perag_put(pag);
+ free_slab_cursor(&rl_cur);
+}
+
/*
* Regenerate the AGFL so that we don't run out of it while rebuilding the
* rmap btree. If skip_rmapbt is true, don't update the rmapbt (most probably
{
xfs_alloc_arg_t args;
xfs_trans_t *tp;
- struct xfs_trans_res tres = {0};
int flags;
int error;
args.mp = mp;
args.agno = agno;
args.alignment = 1;
- args.pag = xfs_perag_get(mp, agno);
- error = -libxfs_trans_alloc(mp, &tres,
- libxfs_alloc_min_freelist(mp, args.pag), 0, 0, &tp);
+ args.pag = libxfs_perag_get(mp, agno);
+ error = -libxfs_trans_alloc_rollable(mp, 0, &tp);
if (error)
do_error(_("failed to fix AGFL on AG %d, error %d\n"),
agno, error);
flags = XFS_ALLOC_FLAG_NOSHRINK;
if (skip_rmapbt)
flags |= XFS_ALLOC_FLAG_NORMAP;
- error = libxfs_alloc_fix_freelist(&args, flags);
- xfs_perag_put(args.pag);
+ error = -libxfs_alloc_fix_freelist(&args, flags);
+ libxfs_perag_put(args.pag);
if (error) {
do_error(_("failed to fix AGFL on AG %d, error %d\n"),
agno, error);
}
- libxfs_trans_commit(tp);
+ error = -libxfs_trans_commit(tp);
+ if (error)
+ do_error(_("%s: commit failed, error %d\n"), __func__, error);
}
/*
xfs_agnumber_t agno,
int count)
{
- if (!needs_rmap_work(mp))
+ if (!rmap_needs_work(mp))
return;
ag_rmaps[agno].ar_flcount = count;
}
+
+/* Estimate the size of the ondisk rmapbt from the incore data. */
+xfs_extlen_t
+estimate_rmapbt_blocks(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_rmap *x;
+ unsigned long long nr_recs = 0;
+
+ if (!rmap_needs_work(mp) || !xfs_has_rmapbt(mp))
+ return 0;
+
+ /*
+ * Overestimate the amount of space needed by pretending that every
+ * record in the incore slab will become rmapbt records.
+ */
+ x = &ag_rmaps[pag->pag_agno];
+ if (x->ar_rmaps)
+ nr_recs += slab_count(x->ar_rmaps);
+ if (x->ar_raw_rmaps)
+ nr_recs += slab_count(x->ar_raw_rmaps);
+
+ return libxfs_rmapbt_calc_size(mp, nr_recs);
+}
+
+/* Estimate the size of the ondisk refcountbt from the incore data. */
+xfs_extlen_t
+estimate_refcountbt_blocks(
+ struct xfs_perag *pag)
+{
+ struct xfs_mount *mp = pag->pag_mount;
+ struct xfs_ag_rmap *x;
+
+ if (!rmap_needs_work(mp) || !xfs_has_reflink(mp))
+ return 0;
+
+ x = &ag_rmaps[pag->pag_agno];
+ if (!x->ar_refcount_items)
+ return 0;
+
+ return libxfs_refcountbt_calc_size(mp,
+ slab_count(x->ar_refcount_items));
+}