1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2015 Red Hat, Inc.
7 /* Various utilities for repair of directory and attribute metadata */
11 #include "err_protos.h"
16 * Takes a name and length (name need not be null-terminated) and whether
17 * we are checking a dir (as opposed to an attr).
18 * Returns 1 if the name contains a NUL or if a directory entry contains a '/'.
19 * Returns 0 if the name checks out.
30 ASSERT(length
< MAXNAMELEN
);
32 for (c
= name
, i
= 0; i
< length
; i
++, c
++) {
33 if (isadir
&& *c
== '/')
43 * the cursor gets passed up and down the da btree processing
44 * routines. The interior block processing routines use the
45 * cursor to determine if the pointers to and from the preceding
46 * and succeeding sibling blocks are ok and whether the values in
47 * the current block are consistent with the entries in the parent
48 * nodes. When a block is traversed, a parent-verification routine
49 * is called to verify if the next logical entry in the next level up
50 * is consistent with the greatest hashval in the next block of the
51 * current level. The verification routine is itself recursive and
52 * calls itself if it has to traverse an interior block to get
53 * the next logical entry. The routine recurses upwards through
54 * the tree until it finds a block where it can simply step to
55 * the next entry. The hashval in that entry should be equal to
56 * the hashval being passed to it (the greatest hashval in the block
57 * that the entry points to). If that isn't true, then the tree
58 * is blown and we need to trash it, salvage and trash it, or fix it.
59 * Currently, we just trash it.
63 * Multibuffer handling.
64 * V2 directory blocks can be noncontiguous, needing multiple buffers.
65 * attr blocks are single blocks; this code handles that as well.
72 const struct xfs_buf_ops
*ops
)
74 #define MAP_ARRAY_SZ 4
75 struct xfs_buf_map map_array
[MAP_ARRAY_SZ
];
76 struct xfs_buf_map
*map
;
80 if (nex
> MAP_ARRAY_SZ
) {
81 map
= calloc(nex
, sizeof(*map
));
83 do_error(_("couldn't malloc dir2 buffer list\n"));
87 /* common case avoids calloc/free */
90 for (i
= 0; i
< nex
; i
++) {
91 map
[i
].bm_bn
= XFS_FSB_TO_DADDR(mp
, bmp
[i
].startblock
);
92 map
[i
].bm_len
= XFS_FSB_TO_BB(mp
, bmp
[i
].blockcount
);
94 bp
= libxfs_readbuf_map(mp
->m_dev
, map
, nex
, 0, ops
);
100 #define FORKNAME(type) (type == XFS_DATA_FORK ? _("directory") : _("attribute"))
103 * walk tree from root to the left-most leaf block reading in
104 * blocks and setting up cursor. passes back file block number of the
105 * left-most leaf block if successful (bno). returns 1 if successful,
109 traverse_int_dablock(
111 da_bt_cursor_t
*da_cursor
,
120 xfs_da_intnode_t
*node
;
122 struct xfs_da_geometry
*geo
;
123 struct xfs_da_node_entry
*btree
;
124 struct xfs_da3_icnode_hdr nodehdr
;
126 if (whichfork
== XFS_DATA_FORK
) {
130 geo
= mp
->m_attr_geo
;
135 * traverse down left-side of tree until we hit the
136 * left-most leaf block setting up the btree cursor along
141 da_cursor
->active
= 0;
145 * read in each block along the way and set up cursor
147 nex
= blkmap_getn(da_cursor
->blkmap
, bno
,
148 geo
->fsbcount
, &bmp
, &lbmp
);
153 bp
= da_read_buf(mp
, nex
, bmp
, &xfs_da3_node_buf_ops
);
159 _("can't read %s block %u for inode %" PRIu64
"\n"),
160 FORKNAME(whichfork
), bno
, da_cursor
->ino
);
165 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, node
);
167 if (whichfork
== XFS_DATA_FORK
&&
168 (nodehdr
.magic
== XFS_DIR2_LEAFN_MAGIC
||
169 nodehdr
.magic
== XFS_DIR3_LEAFN_MAGIC
)) {
172 _("found non-root LEAFN node in inode %" PRIu64
" bno = %u\n"),
173 da_cursor
->ino
, bno
);
180 if (nodehdr
.magic
!= XFS_DA_NODE_MAGIC
&&
181 nodehdr
.magic
!= XFS_DA3_NODE_MAGIC
) {
183 _("bad %s magic number 0x%x in inode %" PRIu64
" bno = %u\n"),
184 FORKNAME(whichfork
), nodehdr
.magic
,
185 da_cursor
->ino
, bno
);
190 /* corrupt node; rebuild the dir. */
191 if (bp
->b_error
== -EFSBADCRC
|| bp
->b_error
== -EFSCORRUPTED
) {
194 _("corrupt %s tree block %u for inode %" PRIu64
"\n"),
195 FORKNAME(whichfork
), bno
, da_cursor
->ino
);
199 btree
= M_DIROPS(mp
)->node_tree_p(node
);
200 if (nodehdr
.count
> geo
->node_ents
) {
202 _("bad %s record count in inode %" PRIu64
", count = %d, max = %d\n"),
203 FORKNAME(whichfork
), da_cursor
->ino
,
204 nodehdr
.count
, geo
->node_ents
);
210 * maintain level counter
213 i
= da_cursor
->active
= nodehdr
.level
;
214 if (i
< 1 || i
>= XFS_DA_NODE_MAXDEPTH
) {
216 _("bad header depth for directory inode %" PRIu64
"\n"),
223 if (nodehdr
.level
== i
- 1) {
227 _("bad %s btree for inode %" PRIu64
"\n"),
228 FORKNAME(whichfork
), da_cursor
->ino
);
234 da_cursor
->level
[i
].hashval
= be32_to_cpu(btree
[0].hashval
);
235 da_cursor
->level
[i
].bp
= bp
;
236 da_cursor
->level
[i
].bno
= bno
;
237 da_cursor
->level
[i
].index
= 0;
240 * set up new bno for next level down
242 bno
= be32_to_cpu(btree
[0].before
);
243 } while (node
!= NULL
&& i
> 1);
246 * now return block number and get out
248 *rbno
= da_cursor
->level
[0].bno
= bno
;
252 while (i
> 1 && i
<= da_cursor
->active
) {
253 libxfs_putbuf(da_cursor
->level
[i
].bp
);
261 * blow out buffer for this level and all the rest above as well
262 * if error == 0, we are not expecting to encounter any unreleased
263 * buffers (i.e. if we do, it's a mistake). if error == 1, we're
264 * in an error-handling case so unreleased buffers may exist.
267 release_da_cursor_int(
269 da_bt_cursor_t
*cursor
,
273 int level
= prev_level
+ 1;
275 if (cursor
->level
[level
].bp
!= NULL
) {
277 do_warn(_("release_da_cursor_int got unexpected "
278 "non-null bp, dabno = %u\n"),
279 cursor
->level
[level
].bno
);
283 libxfs_putbuf(cursor
->level
[level
].bp
);
284 cursor
->level
[level
].bp
= NULL
;
287 if (level
< cursor
->active
)
288 release_da_cursor_int(mp
, cursor
, level
, error
);
296 da_bt_cursor_t
*cursor
,
299 release_da_cursor_int(mp
, cursor
, prev_level
, 0);
303 err_release_da_cursor(
305 da_bt_cursor_t
*cursor
,
308 release_da_cursor_int(mp
, cursor
, prev_level
, 1);
312 * make sure that all entries in all blocks along the right side of
313 * the tree are used and hashval's are consistent. p_level is the
314 * level of the descendant block. returns 0 if good (even if it had
315 * to be fixed up), and 1 if bad. The right edge of the tree is
316 * technically a block boundary. This routine should be used then
317 * instead of verify_da_path().
320 verify_final_da_path(
322 da_bt_cursor_t
*cursor
,
326 xfs_da_intnode_t
*node
;
327 xfs_dahash_t hashval
;
330 int this_level
= p_level
+ 1;
331 struct xfs_da_node_entry
*btree
;
332 struct xfs_da3_icnode_hdr nodehdr
;
335 fprintf(stderr
, "in verify_final_da_path, this_level = %d\n",
340 * the index should point to the next "unprocessed" entry
341 * in the block which should be the final (rightmost) entry
343 entry
= cursor
->level
[this_level
].index
;
344 node
= cursor
->level
[this_level
].bp
->b_addr
;
345 btree
= M_DIROPS(mp
)->node_tree_p(node
);
346 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, node
);
349 * check internal block consistency on this level -- ensure
350 * that all entries are used, encountered and expected hashvals
353 if (entry
!= nodehdr
.count
- 1) {
355 _("%s block used/count inconsistency - %d/%hu\n"),
356 FORKNAME(whichfork
), entry
, nodehdr
.count
);
360 * hash values monotonically increasing ???
362 if (cursor
->level
[this_level
].hashval
>=
363 be32_to_cpu(btree
[entry
].hashval
)) {
365 _("%s block hashvalue inconsistency, expected > %u / saw %u\n"),
367 cursor
->level
[this_level
].hashval
,
368 be32_to_cpu(btree
[entry
].hashval
));
371 if (nodehdr
.forw
!= 0) {
373 _("bad %s forward block pointer, expected 0, saw %u\n"),
374 FORKNAME(whichfork
), nodehdr
.forw
);
378 do_warn(_("bad %s block in inode %" PRIu64
"\n"),
379 FORKNAME(whichfork
), cursor
->ino
);
383 * keep track of greatest block # -- that gets
384 * us the length of the directory/attribute
386 if (cursor
->level
[this_level
].bno
> cursor
->greatest_bno
)
387 cursor
->greatest_bno
= cursor
->level
[this_level
].bno
;
390 * ok, now check descendant block number against this level
392 if (cursor
->level
[p_level
].bno
!= be32_to_cpu(btree
[entry
].before
)) {
394 fprintf(stderr
, "bad %s btree pointer, child bno should "
395 "be %d, block bno is %d, hashval is %u\n",
396 FORKNAME(whichfork
), be16_to_cpu(btree
[entry
].before
),
397 cursor
->level
[p_level
].bno
,
398 cursor
->level
[p_level
].hashval
);
399 fprintf(stderr
, "verify_final_da_path returns 1 (bad) #1a\n");
404 if (cursor
->level
[p_level
].hashval
!=
405 be32_to_cpu(btree
[entry
].hashval
)) {
408 _("correcting bad hashval in non-leaf %s block\n"
409 "\tin (level %d) in inode %" PRIu64
".\n"),
410 FORKNAME(whichfork
), this_level
, cursor
->ino
);
411 btree
[entry
].hashval
= cpu_to_be32(
412 cursor
->level
[p_level
].hashval
);
413 cursor
->level
[this_level
].dirty
++;
416 _("would correct bad hashval in non-leaf %s block\n"
417 "\tin (level %d) in inode %" PRIu64
".\n"),
418 FORKNAME(whichfork
), this_level
, cursor
->ino
);
423 * Note: squirrel hashval away _before_ releasing the
424 * buffer, preventing a use-after-free problem.
426 hashval
= be32_to_cpu(btree
[entry
].hashval
);
429 * release/write buffer
431 ASSERT(cursor
->level
[this_level
].dirty
== 0 ||
432 (cursor
->level
[this_level
].dirty
&& !no_modify
));
434 if (cursor
->level
[this_level
].dirty
&& !no_modify
)
435 libxfs_writebuf(cursor
->level
[this_level
].bp
, 0);
437 libxfs_putbuf(cursor
->level
[this_level
].bp
);
439 cursor
->level
[this_level
].bp
= NULL
;
442 * bail out if this is the root block (top of tree)
444 if (this_level
>= cursor
->active
) {
446 fprintf(stderr
, "verify_final_da_path returns 0 (ok)\n");
451 * set hashvalue to correctly reflect the now-validated
452 * last entry in this block and continue upwards validation
454 cursor
->level
[this_level
].hashval
= hashval
;
456 return verify_final_da_path(mp
, cursor
, this_level
, whichfork
);
460 * Verifies the path from a descendant block up to the root.
461 * Should be called when the descendant level traversal hits
462 * a block boundary before crossing the boundary (reading in a new
465 * the directory/attr btrees work differently to the other fs btrees.
466 * each interior block contains records that are <hashval, bno>
467 * pairs. The bno is a file bno, not a filesystem bno. The last
468 * hashvalue in the block <bno> will be <hashval>. BUT unlike
469 * the freespace btrees, the *last* value in each block gets
470 * propagated up the tree instead of the first value in each block.
471 * that is, the interior records point to child blocks and the *greatest*
472 * hash value contained by the child block is the one the block above
473 * uses as the key for the child block.
475 * p_level is the level of the descendant block. returns 0 if good,
476 * and 1 if bad. The descendant block may be a leaf block.
478 * the invariant here is that the values in the cursor for the
479 * levels beneath this level (this_level) and the cursor index
480 * for this level *must* be valid.
482 * that is, the hashval/bno info is accurate for all
483 * DESCENDANTS and matches the node[index] information
484 * for the current index in the cursor for this level.
486 * the index values in the cursor for the descendant level
487 * are allowed to be off by one as they will reflect the
488 * next entry at those levels to be processed.
490 * the hashvalue for the current level can't be set until
491 * we hit the last entry in the block so, it's garbage
492 * until set by this routine.
494 * bno and bp for the current block/level are always valid
495 * since they have to be set so we can get a buffer for the
501 da_bt_cursor_t
*cursor
,
505 xfs_da_intnode_t
*node
;
506 xfs_da_intnode_t
*newnode
;
511 int this_level
= p_level
+ 1;
515 struct xfs_da_geometry
*geo
;
516 struct xfs_da_node_entry
*btree
;
517 struct xfs_da3_icnode_hdr nodehdr
;
519 if (whichfork
== XFS_DATA_FORK
)
522 geo
= mp
->m_attr_geo
;
524 /* No buffer at this level, tree is corrupt. */
525 if (cursor
->level
[this_level
].bp
== NULL
)
529 * index is currently set to point to the entry that
530 * should be processed now in this level.
532 entry
= cursor
->level
[this_level
].index
;
533 node
= cursor
->level
[this_level
].bp
->b_addr
;
534 btree
= M_DIROPS(mp
)->node_tree_p(node
);
535 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, node
);
537 /* No entries in this node? Tree is corrupt. */
538 if (nodehdr
.count
== 0)
542 * if this block is out of entries, validate this
543 * block and move on to the next block.
544 * and update cursor value for said level
546 if (entry
>= nodehdr
.count
) {
548 * update the hash value for this level before
549 * validating it. bno value should be ok since
550 * it was set when the block was first read in.
552 cursor
->level
[this_level
].hashval
=
553 be32_to_cpu(btree
[entry
- 1].hashval
);
556 * keep track of greatest block # -- that gets
557 * us the length of the directory
559 if (cursor
->level
[this_level
].bno
> cursor
->greatest_bno
)
560 cursor
->greatest_bno
= cursor
->level
[this_level
].bno
;
563 * validate the path for the current used-up block
566 if (verify_da_path(mp
, cursor
, this_level
, whichfork
))
569 * ok, now get the next buffer and check sibling pointers
571 dabno
= nodehdr
.forw
;
573 nex
= blkmap_getn(cursor
->blkmap
, dabno
, geo
->fsbcount
,
577 _("can't get map info for %s block %u of inode %" PRIu64
"\n"),
578 FORKNAME(whichfork
), dabno
, cursor
->ino
);
582 bp
= da_read_buf(mp
, nex
, bmp
, &xfs_da3_node_buf_ops
);
588 _("can't read %s block %u for inode %" PRIu64
"\n"),
589 FORKNAME(whichfork
), dabno
, cursor
->ino
);
593 newnode
= bp
->b_addr
;
594 btree
= M_DIROPS(mp
)->node_tree_p(newnode
);
595 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, newnode
);
598 * verify magic number and back pointer, sanity-check
599 * entry count, verify level
602 if (nodehdr
.magic
!= XFS_DA_NODE_MAGIC
&&
603 nodehdr
.magic
!= XFS_DA3_NODE_MAGIC
) {
605 _("bad magic number %x in %s block %u for inode %" PRIu64
"\n"),
606 nodehdr
.magic
, FORKNAME(whichfork
),
610 if (nodehdr
.back
!= cursor
->level
[this_level
].bno
) {
612 _("bad back pointer in %s block %u for inode %" PRIu64
"\n"),
613 FORKNAME(whichfork
), dabno
, cursor
->ino
);
616 if (nodehdr
.count
> geo
->node_ents
) {
618 _("entry count %d too large in %s block %u for inode %" PRIu64
"\n"),
619 nodehdr
.count
, FORKNAME(whichfork
),
623 if (nodehdr
.level
!= this_level
) {
625 _("bad level %d in %s block %u for inode %" PRIu64
"\n"),
626 nodehdr
.level
, FORKNAME(whichfork
),
632 fprintf(stderr
, "verify_da_path returns 1 (bad) #4\n");
639 * update cursor, write out the *current* level if
640 * required. don't write out the descendant level
642 ASSERT(cursor
->level
[this_level
].dirty
== 0 ||
643 (cursor
->level
[this_level
].dirty
&& !no_modify
));
646 * If block looks ok but CRC didn't match, make sure to
650 cursor
->level
[this_level
].bp
->b_error
== -EFSBADCRC
)
651 cursor
->level
[this_level
].dirty
= 1;
653 if (cursor
->level
[this_level
].dirty
&& !no_modify
)
654 libxfs_writebuf(cursor
->level
[this_level
].bp
, 0);
656 libxfs_putbuf(cursor
->level
[this_level
].bp
);
658 /* switch cursor to point at the new buffer we just read */
659 cursor
->level
[this_level
].bp
= bp
;
660 cursor
->level
[this_level
].dirty
= 0;
661 cursor
->level
[this_level
].bno
= dabno
;
662 cursor
->level
[this_level
].hashval
=
663 be32_to_cpu(btree
[0].hashval
);
665 entry
= cursor
->level
[this_level
].index
= 0;
668 * ditto for block numbers
670 if (cursor
->level
[p_level
].bno
!= be32_to_cpu(btree
[entry
].before
)) {
672 fprintf(stderr
, "bad %s btree pointer, child bno "
673 "should be %d, block bno is %d, hashval is %u\n",
674 FORKNAME(whichfork
), be32_to_cpu(btree
[entry
].before
),
675 cursor
->level
[p_level
].bno
,
676 cursor
->level
[p_level
].hashval
);
677 fprintf(stderr
, "verify_da_path returns 1 (bad) #1a\n");
682 * ok, now validate last hashvalue in the descendant
683 * block against the hashval in the current entry
685 if (cursor
->level
[p_level
].hashval
!=
686 be32_to_cpu(btree
[entry
].hashval
)) {
689 _("correcting bad hashval in interior %s block\n"
690 "\tin (level %d) in inode %" PRIu64
".\n"),
691 FORKNAME(whichfork
), this_level
, cursor
->ino
);
692 btree
[entry
].hashval
= cpu_to_be32(
693 cursor
->level
[p_level
].hashval
);
694 cursor
->level
[this_level
].dirty
++;
697 _("would correct bad hashval in interior %s block\n"
698 "\tin (level %d) in inode %" PRIu64
".\n"),
699 FORKNAME(whichfork
), this_level
, cursor
->ino
);
703 * increment index for this level to point to next entry
704 * (which should point to the next descendant block)
706 cursor
->level
[this_level
].index
++;
708 fprintf(stderr
, "verify_da_path returns 0 (ok)\n");