1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2015 Red Hat, Inc.
7 /* Various utilities for repair of directory and attribute metadata */
11 #include "err_protos.h"
16 * the cursor gets passed up and down the da btree processing
17 * routines. The interior block processing routines use the
18 * cursor to determine if the pointers to and from the preceding
19 * and succeeding sibling blocks are ok and whether the values in
20 * the current block are consistent with the entries in the parent
21 * nodes. When a block is traversed, a parent-verification routine
22 * is called to verify if the next logical entry in the next level up
23 * is consistent with the greatest hashval in the next block of the
24 * current level. The verification routine is itself recursive and
25 * calls itself if it has to traverse an interior block to get
26 * the next logical entry. The routine recurses upwards through
27 * the tree until it finds a block where it can simply step to
28 * the next entry. The hashval in that entry should be equal to
29 * the hashval being passed to it (the greatest hashval in the block
30 * that the entry points to). If that isn't true, then the tree
31 * is blown and we need to trash it, salvage and trash it, or fix it.
32 * Currently, we just trash it.
36 * Multibuffer handling.
37 * V2 directory blocks can be noncontiguous, needing multiple buffers.
38 * attr blocks are single blocks; this code handles that as well.
45 const struct xfs_buf_ops
*ops
)
47 #define MAP_ARRAY_SZ 4
48 struct xfs_buf_map map_array
[MAP_ARRAY_SZ
];
49 struct xfs_buf_map
*map
;
53 if (nex
> MAP_ARRAY_SZ
) {
54 map
= calloc(nex
, sizeof(*map
));
56 do_error(_("couldn't malloc dir2 buffer list\n"));
60 /* common case avoids calloc/free */
63 for (i
= 0; i
< nex
; i
++) {
64 map
[i
].bm_bn
= XFS_FSB_TO_DADDR(mp
, bmp
[i
].startblock
);
65 map
[i
].bm_len
= XFS_FSB_TO_BB(mp
, bmp
[i
].blockcount
);
67 bp
= libxfs_readbuf_map(mp
->m_dev
, map
, nex
, 0, ops
);
73 #define FORKNAME(type) (type == XFS_DATA_FORK ? _("directory") : _("attribute"))
76 * walk tree from root to the left-most leaf block reading in
77 * blocks and setting up cursor. passes back file block number of the
78 * left-most leaf block if successful (bno). returns 1 if successful,
84 da_bt_cursor_t
*da_cursor
,
93 xfs_da_intnode_t
*node
;
95 struct xfs_da_geometry
*geo
;
96 struct xfs_da_node_entry
*btree
;
97 struct xfs_da3_icnode_hdr nodehdr
;
99 if (whichfork
== XFS_DATA_FORK
) {
103 geo
= mp
->m_attr_geo
;
108 * traverse down left-side of tree until we hit the
109 * left-most leaf block setting up the btree cursor along
114 da_cursor
->active
= 0;
118 * read in each block along the way and set up cursor
120 nex
= blkmap_getn(da_cursor
->blkmap
, bno
,
121 geo
->fsbcount
, &bmp
, &lbmp
);
126 bp
= da_read_buf(mp
, nex
, bmp
, &xfs_da3_node_buf_ops
);
132 _("can't read %s block %u for inode %" PRIu64
"\n"),
133 FORKNAME(whichfork
), bno
, da_cursor
->ino
);
138 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, node
);
140 if (whichfork
== XFS_DATA_FORK
&&
141 (nodehdr
.magic
== XFS_DIR2_LEAFN_MAGIC
||
142 nodehdr
.magic
== XFS_DIR3_LEAFN_MAGIC
)) {
145 _("found non-root LEAFN node in inode %" PRIu64
" bno = %u\n"),
146 da_cursor
->ino
, bno
);
153 if (nodehdr
.magic
!= XFS_DA_NODE_MAGIC
&&
154 nodehdr
.magic
!= XFS_DA3_NODE_MAGIC
) {
156 _("bad %s magic number 0x%x in inode %" PRIu64
" bno = %u\n"),
157 FORKNAME(whichfork
), nodehdr
.magic
,
158 da_cursor
->ino
, bno
);
163 /* corrupt node; rebuild the dir. */
164 if (bp
->b_error
== -EFSBADCRC
|| bp
->b_error
== -EFSCORRUPTED
) {
167 _("corrupt %s tree block %u for inode %" PRIu64
"\n"),
168 FORKNAME(whichfork
), bno
, da_cursor
->ino
);
172 btree
= M_DIROPS(mp
)->node_tree_p(node
);
173 if (nodehdr
.count
> geo
->node_ents
) {
175 _("bad %s record count in inode %" PRIu64
", count = %d, max = %d\n"),
176 FORKNAME(whichfork
), da_cursor
->ino
,
177 nodehdr
.count
, geo
->node_ents
);
183 * maintain level counter
186 i
= da_cursor
->active
= nodehdr
.level
;
187 if (i
< 1 || i
>= XFS_DA_NODE_MAXDEPTH
) {
189 _("bad header depth for directory inode %" PRIu64
"\n"),
196 if (nodehdr
.level
== i
- 1) {
200 _("bad %s btree for inode %" PRIu64
"\n"),
201 FORKNAME(whichfork
), da_cursor
->ino
);
207 da_cursor
->level
[i
].hashval
= be32_to_cpu(btree
[0].hashval
);
208 da_cursor
->level
[i
].bp
= bp
;
209 da_cursor
->level
[i
].bno
= bno
;
210 da_cursor
->level
[i
].index
= 0;
213 * set up new bno for next level down
215 bno
= be32_to_cpu(btree
[0].before
);
216 } while (node
!= NULL
&& i
> 1);
219 * now return block number and get out
221 *rbno
= da_cursor
->level
[0].bno
= bno
;
225 while (i
> 1 && i
<= da_cursor
->active
) {
226 libxfs_putbuf(da_cursor
->level
[i
].bp
);
234 * blow out buffer for this level and all the rest above as well
235 * if error == 0, we are not expecting to encounter any unreleased
236 * buffers (e.g. if we do, it's a mistake). if error == 1, we're
237 * in an error-handling case so unreleased buffers may exist.
240 release_da_cursor_int(
242 da_bt_cursor_t
*cursor
,
246 int level
= prev_level
+ 1;
248 if (cursor
->level
[level
].bp
!= NULL
) {
250 do_warn(_("release_da_cursor_int got unexpected "
251 "non-null bp, dabno = %u\n"),
252 cursor
->level
[level
].bno
);
256 libxfs_putbuf(cursor
->level
[level
].bp
);
257 cursor
->level
[level
].bp
= NULL
;
260 if (level
< cursor
->active
)
261 release_da_cursor_int(mp
, cursor
, level
, error
);
269 da_bt_cursor_t
*cursor
,
272 release_da_cursor_int(mp
, cursor
, prev_level
, 0);
276 err_release_da_cursor(
278 da_bt_cursor_t
*cursor
,
281 release_da_cursor_int(mp
, cursor
, prev_level
, 1);
285 * make sure that all entries in all blocks along the right side of
286 * of the tree are used and hashval's are consistent. level is the
287 * level of the descendent block. returns 0 if good (even if it had
288 * to be fixed up), and 1 if bad. The right edge of the tree is
289 * technically a block boundary. This routine should be used then
290 * instead of verify_da_path().
293 verify_final_da_path(
295 da_bt_cursor_t
*cursor
,
299 xfs_da_intnode_t
*node
;
300 xfs_dahash_t hashval
;
303 int this_level
= p_level
+ 1;
304 struct xfs_da_node_entry
*btree
;
305 struct xfs_da3_icnode_hdr nodehdr
;
308 fprintf(stderr
, "in verify_final_da_path, this_level = %d\n",
313 * the index should point to the next "unprocessed" entry
314 * in the block which should be the final (rightmost) entry
316 entry
= cursor
->level
[this_level
].index
;
317 node
= cursor
->level
[this_level
].bp
->b_addr
;
318 btree
= M_DIROPS(mp
)->node_tree_p(node
);
319 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, node
);
322 * check internal block consistency on this level -- ensure
323 * that all entries are used, encountered and expected hashvals
326 if (entry
!= nodehdr
.count
- 1) {
328 _("%s block used/count inconsistency - %d/%hu\n"),
329 FORKNAME(whichfork
), entry
, nodehdr
.count
);
333 * hash values monotonically increasing ???
335 if (cursor
->level
[this_level
].hashval
>=
336 be32_to_cpu(btree
[entry
].hashval
)) {
338 _("%s block hashvalue inconsistency, expected > %u / saw %u\n"),
340 cursor
->level
[this_level
].hashval
,
341 be32_to_cpu(btree
[entry
].hashval
));
344 if (nodehdr
.forw
!= 0) {
346 _("bad %s forward block pointer, expected 0, saw %u\n"),
347 FORKNAME(whichfork
), nodehdr
.forw
);
351 do_warn(_("bad %s block in inode %" PRIu64
"\n"),
352 FORKNAME(whichfork
), cursor
->ino
);
356 * keep track of greatest block # -- that gets
357 * us the length of the directory/attribute
359 if (cursor
->level
[this_level
].bno
> cursor
->greatest_bno
)
360 cursor
->greatest_bno
= cursor
->level
[this_level
].bno
;
363 * ok, now check descendant block number against this level
365 if (cursor
->level
[p_level
].bno
!= be32_to_cpu(btree
[entry
].before
)) {
367 fprintf(stderr
, "bad %s btree pointer, child bno should "
368 "be %d, block bno is %d, hashval is %u\n",
369 FORKNAME(whichfork
), be16_to_cpu(btree
[entry
].before
),
370 cursor
->level
[p_level
].bno
,
371 cursor
->level
[p_level
].hashval
);
372 fprintf(stderr
, "verify_final_da_path returns 1 (bad) #1a\n");
377 if (cursor
->level
[p_level
].hashval
!=
378 be32_to_cpu(btree
[entry
].hashval
)) {
381 _("correcting bad hashval in non-leaf %s block\n"
382 "\tin (level %d) in inode %" PRIu64
".\n"),
383 FORKNAME(whichfork
), this_level
, cursor
->ino
);
384 btree
[entry
].hashval
= cpu_to_be32(
385 cursor
->level
[p_level
].hashval
);
386 cursor
->level
[this_level
].dirty
++;
389 _("would correct bad hashval in non-leaf %s block\n"
390 "\tin (level %d) in inode %" PRIu64
".\n"),
391 FORKNAME(whichfork
), this_level
, cursor
->ino
);
396 * Note: squirrel hashval away _before_ releasing the
397 * buffer, preventing a use-after-free problem.
399 hashval
= be32_to_cpu(btree
[entry
].hashval
);
402 * release/write buffer
404 ASSERT(cursor
->level
[this_level
].dirty
== 0 ||
405 (cursor
->level
[this_level
].dirty
&& !no_modify
));
407 if (cursor
->level
[this_level
].dirty
&& !no_modify
)
408 libxfs_writebuf(cursor
->level
[this_level
].bp
, 0);
410 libxfs_putbuf(cursor
->level
[this_level
].bp
);
412 cursor
->level
[this_level
].bp
= NULL
;
415 * bail out if this is the root block (top of tree)
417 if (this_level
>= cursor
->active
) {
419 fprintf(stderr
, "verify_final_da_path returns 0 (ok)\n");
424 * set hashvalue to correctly reflect the now-validated
425 * last entry in this block and continue upwards validation
427 cursor
->level
[this_level
].hashval
= hashval
;
429 return verify_final_da_path(mp
, cursor
, this_level
, whichfork
);
433 * Verifies the path from a descendant block up to the root.
434 * Should be called when the descendant level traversal hits
435 * a block boundary before crossing the boundary (reading in a new
438 * the directory/attr btrees work differently to the other fs btrees.
439 * each interior block contains records that are <hashval, bno>
440 * pairs. The bno is a file bno, not a filesystem bno. The last
441 * hashvalue in the block <bno> will be <hashval>. BUT unlike
442 * the freespace btrees, the *last* value in each block gets
443 * propagated up the tree instead of the first value in each block.
444 * that is, the interior records point to child blocks and the *greatest*
445 * hash value contained by the child block is the one the block above
446 * uses as the key for the child block.
448 * level is the level of the descendent block. returns 0 if good,
449 * and 1 if bad. The descendant block may be a leaf block.
451 * the invariant here is that the values in the cursor for the
452 * levels beneath this level (this_level) and the cursor index
453 * for this level *must* be valid.
455 * that is, the hashval/bno info is accurate for all
456 * DESCENDANTS and match what the node[index] information
457 * for the current index in the cursor for this level.
459 * the index values in the cursor for the descendant level
460 * are allowed to be off by one as they will reflect the
461 * next entry at those levels to be processed.
463 * the hashvalue for the current level can't be set until
464 * we hit the last entry in the block so, it's garbage
465 * until set by this routine.
467 * bno and bp for the current block/level are always valid
468 * since they have to be set so we can get a buffer for the
474 da_bt_cursor_t
*cursor
,
478 xfs_da_intnode_t
*node
;
479 xfs_da_intnode_t
*newnode
;
484 int this_level
= p_level
+ 1;
488 struct xfs_da_geometry
*geo
;
489 struct xfs_da_node_entry
*btree
;
490 struct xfs_da3_icnode_hdr nodehdr
;
492 if (whichfork
== XFS_DATA_FORK
)
495 geo
= mp
->m_attr_geo
;
497 /* No buffer at this level, tree is corrupt. */
498 if (cursor
->level
[this_level
].bp
== NULL
)
502 * index is currently set to point to the entry that
503 * should be processed now in this level.
505 entry
= cursor
->level
[this_level
].index
;
506 node
= cursor
->level
[this_level
].bp
->b_addr
;
507 btree
= M_DIROPS(mp
)->node_tree_p(node
);
508 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, node
);
510 /* No entries in this node? Tree is corrupt. */
511 if (nodehdr
.count
== 0)
515 * if this block is out of entries, validate this
516 * block and move on to the next block.
517 * and update cursor value for said level
519 if (entry
>= nodehdr
.count
) {
521 * update the hash value for this level before
522 * validating it. bno value should be ok since
523 * it was set when the block was first read in.
525 cursor
->level
[this_level
].hashval
=
526 be32_to_cpu(btree
[entry
- 1].hashval
);
529 * keep track of greatest block # -- that gets
530 * us the length of the directory
532 if (cursor
->level
[this_level
].bno
> cursor
->greatest_bno
)
533 cursor
->greatest_bno
= cursor
->level
[this_level
].bno
;
536 * validate the path for the current used-up block
539 if (verify_da_path(mp
, cursor
, this_level
, whichfork
))
542 * ok, now get the next buffer and check sibling pointers
544 dabno
= nodehdr
.forw
;
546 nex
= blkmap_getn(cursor
->blkmap
, dabno
, geo
->fsbcount
,
550 _("can't get map info for %s block %u of inode %" PRIu64
"\n"),
551 FORKNAME(whichfork
), dabno
, cursor
->ino
);
555 bp
= da_read_buf(mp
, nex
, bmp
, &xfs_da3_node_buf_ops
);
561 _("can't read %s block %u for inode %" PRIu64
"\n"),
562 FORKNAME(whichfork
), dabno
, cursor
->ino
);
566 newnode
= bp
->b_addr
;
567 btree
= M_DIROPS(mp
)->node_tree_p(newnode
);
568 M_DIROPS(mp
)->node_hdr_from_disk(&nodehdr
, newnode
);
571 * verify magic number and back pointer, sanity-check
572 * entry count, verify level
575 if (nodehdr
.magic
!= XFS_DA_NODE_MAGIC
&&
576 nodehdr
.magic
!= XFS_DA3_NODE_MAGIC
) {
578 _("bad magic number %x in %s block %u for inode %" PRIu64
"\n"),
579 nodehdr
.magic
, FORKNAME(whichfork
),
583 if (nodehdr
.back
!= cursor
->level
[this_level
].bno
) {
585 _("bad back pointer in %s block %u for inode %" PRIu64
"\n"),
586 FORKNAME(whichfork
), dabno
, cursor
->ino
);
589 if (nodehdr
.count
> geo
->node_ents
) {
591 _("entry count %d too large in %s block %u for inode %" PRIu64
"\n"),
592 nodehdr
.count
, FORKNAME(whichfork
),
596 if (nodehdr
.level
!= this_level
) {
598 _("bad level %d in %s block %u for inode %" PRIu64
"\n"),
599 nodehdr
.level
, FORKNAME(whichfork
),
605 fprintf(stderr
, "verify_da_path returns 1 (bad) #4\n");
612 * update cursor, write out the *current* level if
613 * required. don't write out the descendant level
615 ASSERT(cursor
->level
[this_level
].dirty
== 0 ||
616 (cursor
->level
[this_level
].dirty
&& !no_modify
));
619 * If block looks ok but CRC didn't match, make sure to
623 cursor
->level
[this_level
].bp
->b_error
== -EFSBADCRC
)
624 cursor
->level
[this_level
].dirty
= 1;
626 if (cursor
->level
[this_level
].dirty
&& !no_modify
)
627 libxfs_writebuf(cursor
->level
[this_level
].bp
, 0);
629 libxfs_putbuf(cursor
->level
[this_level
].bp
);
631 /* switch cursor to point at the new buffer we just read */
632 cursor
->level
[this_level
].bp
= bp
;
633 cursor
->level
[this_level
].dirty
= 0;
634 cursor
->level
[this_level
].bno
= dabno
;
635 cursor
->level
[this_level
].hashval
=
636 be32_to_cpu(btree
[0].hashval
);
638 entry
= cursor
->level
[this_level
].index
= 0;
641 * ditto for block numbers
643 if (cursor
->level
[p_level
].bno
!= be32_to_cpu(btree
[entry
].before
)) {
645 fprintf(stderr
, "bad %s btree pointer, child bno "
646 "should be %d, block bno is %d, hashval is %u\n",
647 FORKNAME(whichfork
), be32_to_cpu(btree
[entry
].before
),
648 cursor
->level
[p_level
].bno
,
649 cursor
->level
[p_level
].hashval
);
650 fprintf(stderr
, "verify_da_path returns 1 (bad) #1a\n");
655 * ok, now validate last hashvalue in the descendant
656 * block against the hashval in the current entry
658 if (cursor
->level
[p_level
].hashval
!=
659 be32_to_cpu(btree
[entry
].hashval
)) {
662 _("correcting bad hashval in interior %s block\n"
663 "\tin (level %d) in inode %" PRIu64
".\n"),
664 FORKNAME(whichfork
), this_level
, cursor
->ino
);
665 btree
[entry
].hashval
= cpu_to_be32(
666 cursor
->level
[p_level
].hashval
);
667 cursor
->level
[this_level
].dirty
++;
670 _("would correct bad hashval in interior %s block\n"
671 "\tin (level %d) in inode %" PRIu64
".\n"),
672 FORKNAME(whichfork
), this_level
, cursor
->ino
);
676 * increment index for this level to point to next entry
677 * (which should point to the next descendant block)
679 cursor
->level
[this_level
].index
++;
681 fprintf(stderr
, "verify_da_path returns 0 (ok)\n");