1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
13 #include "xfs_mount.h"
14 #include "xfs_trans.h"
15 #include "xfs_buf_item.h"
16 #include "xfs_trans_priv.h"
17 #include "xfs_trace.h"
19 #include "xfs_log_priv.h"
20 #include "xfs_log_recover.h"
21 #include "xfs_error.h"
22 #include "xfs_inode.h"
24 #include "xfs_quota.h"
/*
 * This is the number of entries in the l_buf_cancel_table used during
 * recovery.
 */
#define XLOG_BC_TABLE_SIZE	64

/*
 * Map a block number to its bucket in the log's buffer cancel table: the
 * bucket index is the block number modulo XLOG_BC_TABLE_SIZE.
 *
 * Both macro arguments are parenthesized so that an expression argument
 * (e.g. "base + off") is cast and reduced as a whole instead of being split
 * apart by operator precedence.
 */
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
	((log)->l_buf_cancel_table + ((uint64_t)(blkno) % XLOG_BC_TABLE_SIZE))
36 * This structure is used during recovery to record the buf log items which
37 * have been canceled and should not be replayed.
39 struct xfs_buf_cancel
{
43 struct list_head bc_list
;
46 static struct xfs_buf_cancel
*
47 xlog_find_buffer_cancelled(
52 struct list_head
*bucket
;
53 struct xfs_buf_cancel
*bcp
;
55 if (!log
->l_buf_cancel_table
)
58 bucket
= XLOG_BUF_CANCEL_BUCKET(log
, blkno
);
59 list_for_each_entry(bcp
, bucket
, bc_list
) {
60 if (bcp
->bc_blkno
== blkno
&& bcp
->bc_len
== len
)
68 xlog_add_buffer_cancelled(
73 struct xfs_buf_cancel
*bcp
;
76 * If we find an existing cancel record, this indicates that the buffer
77 * was cancelled multiple times. To ensure that during pass 2 we keep
78 * the record in the table until we reach its last occurrence in the
79 * log, a reference count is kept to tell how many times we expect to
80 * see this record during the second pass.
82 bcp
= xlog_find_buffer_cancelled(log
, blkno
, len
);
88 bcp
= kmem_alloc(sizeof(struct xfs_buf_cancel
), 0);
89 bcp
->bc_blkno
= blkno
;
92 list_add_tail(&bcp
->bc_list
, XLOG_BUF_CANCEL_BUCKET(log
, blkno
));
97 * Check if there is an entry for blkno, len in the buffer cancel record table.
100 xlog_is_buffer_cancelled(
105 return xlog_find_buffer_cancelled(log
, blkno
, len
) != NULL
;
109 * Check if there is an entry for blkno, len in the buffer cancel record table,
110 * and decrement the reference count on it if there is one.
112 * Remove the cancel record once the refcount hits zero, so that if the same
113 * buffer is re-used again after its last cancellation we actually replay the
114 * changes made at that point.
117 xlog_put_buffer_cancelled(
122 struct xfs_buf_cancel
*bcp
;
124 bcp
= xlog_find_buffer_cancelled(log
, blkno
, len
);
130 if (--bcp
->bc_refcount
== 0) {
131 list_del(&bcp
->bc_list
);
137 /* log buffer item recovery */
140 * Sort buffer items for log recovery. Most buffer items should end up on the
141 * buffer list and are recovered first, with the following exceptions:
143 * 1. XFS_BLF_CANCEL buffers must be processed last because some log items
144 * might depend on the incore cancellation record, and replaying a cancelled
145 * buffer item can remove the incore record.
147 * 2. XFS_BLF_INODE_BUF buffers are handled after most regular items so that
148 * we replay di_next_unlinked only after flushing the inode 'free' state
149 * to the inode buffer.
151 * See xlog_recover_reorder_trans for more details.
/*
 * Classify a buffer log item by the blf_flags of the xfs_buf_log_format held
 * in the item's first region (ri_buf[0]) and return the matching reorder list.
 */
153 STATIC
enum xlog_recover_reorder
154 xlog_recover_buf_reorder(
155 struct xlog_recover_item
*item
)
157 struct xfs_buf_log_format
*buf_f
= item
->ri_buf
[0].i_addr
;
/*
 * XFS_BLF_CANCEL is tested first, so an item carrying both the cancel and
 * the inode-buffer flag is placed on the cancel list.
 */
159 if (buf_f
->blf_flags
& XFS_BLF_CANCEL
)
160 return XLOG_REORDER_CANCEL_LIST
;
/* Inode buffers get their own list, separate from regular buffers. */
161 if (buf_f
->blf_flags
& XFS_BLF_INODE_BUF
)
162 return XLOG_REORDER_INODE_BUFFER_LIST
;
/* Neither flag set: an ordinary buffer item. */
163 return XLOG_REORDER_BUFFER_LIST
;
167 xlog_recover_buf_ra_pass2(
169 struct xlog_recover_item
*item
)
171 struct xfs_buf_log_format
*buf_f
= item
->ri_buf
[0].i_addr
;
173 xlog_buf_readahead(log
, buf_f
->blf_blkno
, buf_f
->blf_len
, NULL
);
177 * Build up the table of buf cancel records so that we don't replay cancelled
178 * data in the second pass.
181 xlog_recover_buf_commit_pass1(
183 struct xlog_recover_item
*item
)
185 struct xfs_buf_log_format
*bf
= item
->ri_buf
[0].i_addr
;
187 if (!xfs_buf_log_check_iovec(&item
->ri_buf
[0])) {
188 xfs_err(log
->l_mp
, "bad buffer log item size (%d)",
189 item
->ri_buf
[0].i_len
);
190 return -EFSCORRUPTED
;
193 if (!(bf
->blf_flags
& XFS_BLF_CANCEL
))
194 trace_xfs_log_recover_buf_not_cancel(log
, bf
);
195 else if (xlog_add_buffer_cancelled(log
, bf
->blf_blkno
, bf
->blf_len
))
196 trace_xfs_log_recover_buf_cancel_add(log
, bf
);
198 trace_xfs_log_recover_buf_cancel_ref_inc(log
, bf
);
203 * Validate the recovered buffer is of the correct type and attach the
204 * appropriate buffer operations to them for writeback. Magic numbers are in a
206 * the first 16 bits of the buffer (inode buffer, dquot buffer),
207 * the first 32 bits of the buffer (most blocks),
208 * inside a struct xfs_da_blkinfo at the start of the buffer.
211 xlog_recover_validate_buf_type(
212 struct xfs_mount
*mp
,
214 struct xfs_buf_log_format
*buf_f
,
215 xfs_lsn_t current_lsn
)
217 struct xfs_da_blkinfo
*info
= bp
->b_addr
;
221 char *warnmsg
= NULL
;
224 * We can only do post recovery validation on items on CRC enabled
225 * filesystems as we need to know when the buffer was written to be able
226 * to determine if we should have replayed the item. If we replay old
227 * metadata over a newer buffer, then it will enter a temporarily
228 * inconsistent state resulting in verification failures. Hence for now
229 * just avoid the verification stage for non-crc filesystems
231 if (!xfs_has_crc(mp
))
234 magic32
= be32_to_cpu(*(__be32
*)bp
->b_addr
);
235 magic16
= be16_to_cpu(*(__be16
*)bp
->b_addr
);
236 magicda
= be16_to_cpu(info
->magic
);
237 switch (xfs_blft_from_flags(buf_f
)) {
238 case XFS_BLFT_BTREE_BUF
:
240 case XFS_ABTB_CRC_MAGIC
:
242 bp
->b_ops
= &xfs_bnobt_buf_ops
;
244 case XFS_ABTC_CRC_MAGIC
:
246 bp
->b_ops
= &xfs_cntbt_buf_ops
;
248 case XFS_IBT_CRC_MAGIC
:
250 bp
->b_ops
= &xfs_inobt_buf_ops
;
252 case XFS_FIBT_CRC_MAGIC
:
254 bp
->b_ops
= &xfs_finobt_buf_ops
;
256 case XFS_BMAP_CRC_MAGIC
:
258 bp
->b_ops
= &xfs_bmbt_buf_ops
;
260 case XFS_RMAP_CRC_MAGIC
:
261 bp
->b_ops
= &xfs_rmapbt_buf_ops
;
263 case XFS_REFC_CRC_MAGIC
:
264 bp
->b_ops
= &xfs_refcountbt_buf_ops
;
267 warnmsg
= "Bad btree block magic!";
271 case XFS_BLFT_AGF_BUF
:
272 if (magic32
!= XFS_AGF_MAGIC
) {
273 warnmsg
= "Bad AGF block magic!";
276 bp
->b_ops
= &xfs_agf_buf_ops
;
278 case XFS_BLFT_AGFL_BUF
:
279 if (magic32
!= XFS_AGFL_MAGIC
) {
280 warnmsg
= "Bad AGFL block magic!";
283 bp
->b_ops
= &xfs_agfl_buf_ops
;
285 case XFS_BLFT_AGI_BUF
:
286 if (magic32
!= XFS_AGI_MAGIC
) {
287 warnmsg
= "Bad AGI block magic!";
290 bp
->b_ops
= &xfs_agi_buf_ops
;
292 case XFS_BLFT_UDQUOT_BUF
:
293 case XFS_BLFT_PDQUOT_BUF
:
294 case XFS_BLFT_GDQUOT_BUF
:
295 #ifdef CONFIG_XFS_QUOTA
296 if (magic16
!= XFS_DQUOT_MAGIC
) {
297 warnmsg
= "Bad DQUOT block magic!";
300 bp
->b_ops
= &xfs_dquot_buf_ops
;
303 "Trying to recover dquots without QUOTA support built in!");
307 case XFS_BLFT_DINO_BUF
:
308 if (magic16
!= XFS_DINODE_MAGIC
) {
309 warnmsg
= "Bad INODE block magic!";
312 bp
->b_ops
= &xfs_inode_buf_ops
;
314 case XFS_BLFT_SYMLINK_BUF
:
315 if (magic32
!= XFS_SYMLINK_MAGIC
) {
316 warnmsg
= "Bad symlink block magic!";
319 bp
->b_ops
= &xfs_symlink_buf_ops
;
321 case XFS_BLFT_DIR_BLOCK_BUF
:
322 if (magic32
!= XFS_DIR2_BLOCK_MAGIC
&&
323 magic32
!= XFS_DIR3_BLOCK_MAGIC
) {
324 warnmsg
= "Bad dir block magic!";
327 bp
->b_ops
= &xfs_dir3_block_buf_ops
;
329 case XFS_BLFT_DIR_DATA_BUF
:
330 if (magic32
!= XFS_DIR2_DATA_MAGIC
&&
331 magic32
!= XFS_DIR3_DATA_MAGIC
) {
332 warnmsg
= "Bad dir data magic!";
335 bp
->b_ops
= &xfs_dir3_data_buf_ops
;
337 case XFS_BLFT_DIR_FREE_BUF
:
338 if (magic32
!= XFS_DIR2_FREE_MAGIC
&&
339 magic32
!= XFS_DIR3_FREE_MAGIC
) {
340 warnmsg
= "Bad dir3 free magic!";
343 bp
->b_ops
= &xfs_dir3_free_buf_ops
;
345 case XFS_BLFT_DIR_LEAF1_BUF
:
346 if (magicda
!= XFS_DIR2_LEAF1_MAGIC
&&
347 magicda
!= XFS_DIR3_LEAF1_MAGIC
) {
348 warnmsg
= "Bad dir leaf1 magic!";
351 bp
->b_ops
= &xfs_dir3_leaf1_buf_ops
;
353 case XFS_BLFT_DIR_LEAFN_BUF
:
354 if (magicda
!= XFS_DIR2_LEAFN_MAGIC
&&
355 magicda
!= XFS_DIR3_LEAFN_MAGIC
) {
356 warnmsg
= "Bad dir leafn magic!";
359 bp
->b_ops
= &xfs_dir3_leafn_buf_ops
;
361 case XFS_BLFT_DA_NODE_BUF
:
362 if (magicda
!= XFS_DA_NODE_MAGIC
&&
363 magicda
!= XFS_DA3_NODE_MAGIC
) {
364 warnmsg
= "Bad da node magic!";
367 bp
->b_ops
= &xfs_da3_node_buf_ops
;
369 case XFS_BLFT_ATTR_LEAF_BUF
:
370 if (magicda
!= XFS_ATTR_LEAF_MAGIC
&&
371 magicda
!= XFS_ATTR3_LEAF_MAGIC
) {
372 warnmsg
= "Bad attr leaf magic!";
375 bp
->b_ops
= &xfs_attr3_leaf_buf_ops
;
377 case XFS_BLFT_ATTR_RMT_BUF
:
378 if (magic32
!= XFS_ATTR3_RMT_MAGIC
) {
379 warnmsg
= "Bad attr remote magic!";
382 bp
->b_ops
= &xfs_attr3_rmt_buf_ops
;
384 case XFS_BLFT_SB_BUF
:
385 if (magic32
!= XFS_SB_MAGIC
) {
386 warnmsg
= "Bad SB block magic!";
389 bp
->b_ops
= &xfs_sb_buf_ops
;
392 case XFS_BLFT_RTBITMAP_BUF
:
393 case XFS_BLFT_RTSUMMARY_BUF
:
394 /* no magic numbers for verification of RT buffers */
395 bp
->b_ops
= &xfs_rtbuf_ops
;
397 #endif /* CONFIG_XFS_RT */
399 xfs_warn(mp
, "Unknown buffer type %d!",
400 xfs_blft_from_flags(buf_f
));
405 * Nothing else to do in the case of a NULL current LSN as this means
406 * the buffer is more recent than the change in the log and will be
409 if (current_lsn
== NULLCOMMITLSN
)
413 xfs_warn(mp
, warnmsg
);
418 * We must update the metadata LSN of the buffer as it is written out to
419 * ensure that older transactions never replay over this one and corrupt
420 * the buffer. This can occur if log recovery is interrupted at some
421 * point after the current transaction completes, at which point a
422 * subsequent mount starts recovery from the beginning.
424 * Write verifiers update the metadata LSN from log items attached to
425 * the buffer. Therefore, initialize a bli purely to carry the LSN to
429 struct xfs_buf_log_item
*bip
;
431 bp
->b_flags
|= _XBF_LOGRECOVERY
;
432 xfs_buf_item_init(bp
, mp
);
433 bip
= bp
->b_log_item
;
434 bip
->bli_item
.li_lsn
= current_lsn
;
439 * Perform a 'normal' buffer recovery. Each logged region of the
440 * buffer should be copied over the corresponding region in the
441 * given buffer. The bitmap in the buf log format structure indicates
442 * where to place the logged data.
445 xlog_recover_do_reg_buffer(
446 struct xfs_mount
*mp
,
447 struct xlog_recover_item
*item
,
449 struct xfs_buf_log_format
*buf_f
,
450 xfs_lsn_t current_lsn
)
456 const size_t size_disk_dquot
= sizeof(struct xfs_disk_dquot
);
458 trace_xfs_log_recover_buf_reg_buf(mp
->m_log
, buf_f
);
461 i
= 1; /* 0 is the buf format structure */
463 bit
= xfs_next_bit(buf_f
->blf_data_map
,
464 buf_f
->blf_map_size
, bit
);
467 nbits
= xfs_contig_bits(buf_f
->blf_data_map
,
468 buf_f
->blf_map_size
, bit
);
470 ASSERT(item
->ri_buf
[i
].i_addr
!= NULL
);
471 ASSERT(item
->ri_buf
[i
].i_len
% XFS_BLF_CHUNK
== 0);
472 ASSERT(BBTOB(bp
->b_length
) >=
473 ((uint
)bit
<< XFS_BLF_SHIFT
) + (nbits
<< XFS_BLF_SHIFT
));
476 * The dirty regions logged in the buffer, even though
477 * contiguous, may span multiple chunks. This is because the
478 * dirty region may span a physical page boundary in a buffer
479 * and hence be split into two separate vectors for writing into
480 * the log. Hence we need to trim nbits back to the length of
481 * the current region being copied out of the log.
483 if (item
->ri_buf
[i
].i_len
< (nbits
<< XFS_BLF_SHIFT
))
484 nbits
= item
->ri_buf
[i
].i_len
>> XFS_BLF_SHIFT
;
487 * Do a sanity check if this is a dquot buffer. Just checking
488 * the first dquot in the buffer should do. XXXThis is
489 * probably a good thing to do for other buf types also.
492 if (buf_f
->blf_flags
&
493 (XFS_BLF_UDQUOT_BUF
|XFS_BLF_PDQUOT_BUF
|XFS_BLF_GDQUOT_BUF
)) {
494 if (item
->ri_buf
[i
].i_addr
== NULL
) {
496 "XFS: NULL dquot in %s.", __func__
);
499 if (item
->ri_buf
[i
].i_len
< size_disk_dquot
) {
501 "XFS: dquot too small (%d) in %s.",
502 item
->ri_buf
[i
].i_len
, __func__
);
505 fa
= xfs_dquot_verify(mp
, item
->ri_buf
[i
].i_addr
, -1);
508 "dquot corrupt at %pS trying to replay into block 0x%llx",
509 fa
, xfs_buf_daddr(bp
));
514 memcpy(xfs_buf_offset(bp
,
515 (uint
)bit
<< XFS_BLF_SHIFT
), /* dest */
516 item
->ri_buf
[i
].i_addr
, /* source */
517 nbits
<<XFS_BLF_SHIFT
); /* length */
523 /* Shouldn't be any more regions */
524 ASSERT(i
== item
->ri_total
);
526 xlog_recover_validate_buf_type(mp
, bp
, buf_f
, current_lsn
);
530 * Perform a dquot buffer recovery.
531 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
532 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
533 * Else, treat it as a regular buffer and do recovery.
535 * Return false if the buffer was tossed and true if we recovered the buffer to
536 * indicate to the caller if the buffer needs writing.
539 xlog_recover_do_dquot_buffer(
540 struct xfs_mount
*mp
,
542 struct xlog_recover_item
*item
,
544 struct xfs_buf_log_format
*buf_f
)
548 trace_xfs_log_recover_buf_dquot_buf(log
, buf_f
);
551 * Filesystems are required to send in quota flags at mount time.
557 if (buf_f
->blf_flags
& XFS_BLF_UDQUOT_BUF
)
558 type
|= XFS_DQTYPE_USER
;
559 if (buf_f
->blf_flags
& XFS_BLF_PDQUOT_BUF
)
560 type
|= XFS_DQTYPE_PROJ
;
561 if (buf_f
->blf_flags
& XFS_BLF_GDQUOT_BUF
)
562 type
|= XFS_DQTYPE_GROUP
;
564 * This type of quotas was turned off, so ignore this buffer
566 if (log
->l_quotaoffs_flag
& type
)
569 xlog_recover_do_reg_buffer(mp
, item
, bp
, buf_f
, NULLCOMMITLSN
);
574 * Perform recovery for a buffer full of inodes. In these buffers, the only
575 * data which should be recovered is that which corresponds to the
576 * di_next_unlinked pointers in the on disk inode structures. The rest of the
577 * data for the inodes is always logged through the inodes themselves rather
578 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
580 * The only time when buffers full of inodes are fully recovered is when the
581 * buffer is full of newly allocated inodes. In this case the buffer will
582 * not be marked as an inode buffer and so will be sent to
583 * xlog_recover_do_reg_buffer() below during recovery.
586 xlog_recover_do_inode_buffer(
587 struct xfs_mount
*mp
,
588 struct xlog_recover_item
*item
,
590 struct xfs_buf_log_format
*buf_f
)
596 int reg_buf_offset
= 0;
597 int reg_buf_bytes
= 0;
598 int next_unlinked_offset
;
600 xfs_agino_t
*logged_nextp
;
601 xfs_agino_t
*buffer_nextp
;
603 trace_xfs_log_recover_buf_inode_buf(mp
->m_log
, buf_f
);
606 * Post recovery validation only works properly on CRC enabled
610 bp
->b_ops
= &xfs_inode_buf_ops
;
612 inodes_per_buf
= BBTOB(bp
->b_length
) >> mp
->m_sb
.sb_inodelog
;
613 for (i
= 0; i
< inodes_per_buf
; i
++) {
614 next_unlinked_offset
= (i
* mp
->m_sb
.sb_inodesize
) +
615 offsetof(struct xfs_dinode
, di_next_unlinked
);
617 while (next_unlinked_offset
>=
618 (reg_buf_offset
+ reg_buf_bytes
)) {
620 * The next di_next_unlinked field is beyond
621 * the current logged region. Find the next
622 * logged region that contains or is beyond
623 * the current di_next_unlinked field.
626 bit
= xfs_next_bit(buf_f
->blf_data_map
,
627 buf_f
->blf_map_size
, bit
);
630 * If there are no more logged regions in the
631 * buffer, then we're done.
636 nbits
= xfs_contig_bits(buf_f
->blf_data_map
,
637 buf_f
->blf_map_size
, bit
);
639 reg_buf_offset
= bit
<< XFS_BLF_SHIFT
;
640 reg_buf_bytes
= nbits
<< XFS_BLF_SHIFT
;
645 * If the current logged region starts after the current
646 * di_next_unlinked field, then move on to the next
647 * di_next_unlinked field.
649 if (next_unlinked_offset
< reg_buf_offset
)
652 ASSERT(item
->ri_buf
[item_index
].i_addr
!= NULL
);
653 ASSERT((item
->ri_buf
[item_index
].i_len
% XFS_BLF_CHUNK
) == 0);
654 ASSERT((reg_buf_offset
+ reg_buf_bytes
) <= BBTOB(bp
->b_length
));
657 * The current logged region contains a copy of the
658 * current di_next_unlinked field. Extract its value
659 * and copy it to the buffer copy.
661 logged_nextp
= item
->ri_buf
[item_index
].i_addr
+
662 next_unlinked_offset
- reg_buf_offset
;
663 if (XFS_IS_CORRUPT(mp
, *logged_nextp
== 0)) {
665 "Bad inode buffer log record (ptr = "PTR_FMT
", bp = "PTR_FMT
"). "
666 "Trying to replay bad (0) inode di_next_unlinked field.",
668 return -EFSCORRUPTED
;
671 buffer_nextp
= xfs_buf_offset(bp
, next_unlinked_offset
);
672 *buffer_nextp
= *logged_nextp
;
675 * If necessary, recalculate the CRC in the on-disk inode. We
676 * have to leave the inode in a consistent state for whoever
679 xfs_dinode_calc_crc(mp
,
680 xfs_buf_offset(bp
, i
* mp
->m_sb
.sb_inodesize
));
688 * V5 filesystems know the age of the buffer on disk being recovered. We can
689 * have newer objects on disk than we are replaying, and so for these cases we
690 * don't want to replay the current change as that will make the buffer contents
691 * temporarily invalid on disk.
693 * The magic number might not match the buffer type we are going to recover
694 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
695 * extract the LSN of the existing object in the buffer based on its current
696 * magic number. If we don't recognise the magic number in the buffer, then
697 * return a LSN of -1 so that the caller knows it was an unrecognised block and
698 * so can recover the buffer.
700 * Note: we cannot rely solely on magic number matches to determine that the
701 * buffer has a valid LSN - we also need to verify that it belongs to this
702 * filesystem, so we need to extract the object's LSN and compare it to that
703 * which we read from the superblock. If the UUIDs don't match, then we've got a
704 * stale metadata block from an old filesystem instance that we need to recover
708 xlog_recover_get_buf_lsn(
709 struct xfs_mount
*mp
,
711 struct xfs_buf_log_format
*buf_f
)
716 void *blk
= bp
->b_addr
;
721 /* v4 filesystems always recover immediately */
722 if (!xfs_has_crc(mp
))
723 goto recover_immediately
;
726 * realtime bitmap and summary file blocks do not have magic numbers or
727 * UUIDs, so we must recover them immediately.
729 blft
= xfs_blft_from_flags(buf_f
);
730 if (blft
== XFS_BLFT_RTBITMAP_BUF
|| blft
== XFS_BLFT_RTSUMMARY_BUF
)
731 goto recover_immediately
;
733 magic32
= be32_to_cpu(*(__be32
*)blk
);
735 case XFS_ABTB_CRC_MAGIC
:
736 case XFS_ABTC_CRC_MAGIC
:
739 case XFS_RMAP_CRC_MAGIC
:
740 case XFS_REFC_CRC_MAGIC
:
741 case XFS_FIBT_CRC_MAGIC
:
743 case XFS_IBT_CRC_MAGIC
:
744 case XFS_IBT_MAGIC
: {
745 struct xfs_btree_block
*btb
= blk
;
747 lsn
= be64_to_cpu(btb
->bb_u
.s
.bb_lsn
);
748 uuid
= &btb
->bb_u
.s
.bb_uuid
;
751 case XFS_BMAP_CRC_MAGIC
:
752 case XFS_BMAP_MAGIC
: {
753 struct xfs_btree_block
*btb
= blk
;
755 lsn
= be64_to_cpu(btb
->bb_u
.l
.bb_lsn
);
756 uuid
= &btb
->bb_u
.l
.bb_uuid
;
760 lsn
= be64_to_cpu(((struct xfs_agf
*)blk
)->agf_lsn
);
761 uuid
= &((struct xfs_agf
*)blk
)->agf_uuid
;
764 lsn
= be64_to_cpu(((struct xfs_agfl
*)blk
)->agfl_lsn
);
765 uuid
= &((struct xfs_agfl
*)blk
)->agfl_uuid
;
768 lsn
= be64_to_cpu(((struct xfs_agi
*)blk
)->agi_lsn
);
769 uuid
= &((struct xfs_agi
*)blk
)->agi_uuid
;
771 case XFS_SYMLINK_MAGIC
:
772 lsn
= be64_to_cpu(((struct xfs_dsymlink_hdr
*)blk
)->sl_lsn
);
773 uuid
= &((struct xfs_dsymlink_hdr
*)blk
)->sl_uuid
;
775 case XFS_DIR3_BLOCK_MAGIC
:
776 case XFS_DIR3_DATA_MAGIC
:
777 case XFS_DIR3_FREE_MAGIC
:
778 lsn
= be64_to_cpu(((struct xfs_dir3_blk_hdr
*)blk
)->lsn
);
779 uuid
= &((struct xfs_dir3_blk_hdr
*)blk
)->uuid
;
781 case XFS_ATTR3_RMT_MAGIC
:
783 * Remote attr blocks are written synchronously, rather than
784 * being logged. That means they do not contain a valid LSN
785 * (i.e. transactionally ordered) in them, and hence any time we
786 * see a buffer to replay over the top of a remote attribute
787 * block we should simply do so.
789 goto recover_immediately
;
792 * superblock uuids are magic. We may or may not have a
793 * sb_meta_uuid on disk, but it will be set in the in-core
794 * superblock. We set the uuid pointer for verification
795 * according to the superblock feature mask to ensure we check
796 * the relevant UUID in the superblock.
798 lsn
= be64_to_cpu(((struct xfs_dsb
*)blk
)->sb_lsn
);
799 if (xfs_has_metauuid(mp
))
800 uuid
= &((struct xfs_dsb
*)blk
)->sb_meta_uuid
;
802 uuid
= &((struct xfs_dsb
*)blk
)->sb_uuid
;
808 if (lsn
!= (xfs_lsn_t
)-1) {
809 if (!uuid_equal(&mp
->m_sb
.sb_meta_uuid
, uuid
))
810 goto recover_immediately
;
814 magicda
= be16_to_cpu(((struct xfs_da_blkinfo
*)blk
)->magic
);
816 case XFS_DIR3_LEAF1_MAGIC
:
817 case XFS_DIR3_LEAFN_MAGIC
:
818 case XFS_ATTR3_LEAF_MAGIC
:
819 case XFS_DA3_NODE_MAGIC
:
820 lsn
= be64_to_cpu(((struct xfs_da3_blkinfo
*)blk
)->lsn
);
821 uuid
= &((struct xfs_da3_blkinfo
*)blk
)->uuid
;
827 if (lsn
!= (xfs_lsn_t
)-1) {
828 if (!uuid_equal(&mp
->m_sb
.sb_meta_uuid
, uuid
))
829 goto recover_immediately
;
834 * We do individual object checks on dquot and inode buffers as they
835 * have their own individual LSN records. Also, we could have a stale
836 * buffer here, so we have to at least recognise these buffer types.
838 * A noted complexity here is inode unlinked list processing - it logs
839 * the inode directly in the buffer, but we don't know which inodes have
840 * been modified, and there is no global buffer LSN. Hence we need to
841 * recover all inode buffer types immediately. This problem will be
842 * fixed by logical logging of the unlinked list modifications.
844 magic16
= be16_to_cpu(*(__be16
*)blk
);
846 case XFS_DQUOT_MAGIC
:
847 case XFS_DINODE_MAGIC
:
848 goto recover_immediately
;
853 /* unknown buffer contents, recover immediately */
856 return (xfs_lsn_t
)-1;
861 * This routine replays a modification made to a buffer at runtime.
862 * There are actually two types of buffer, regular and inode, which
863 * are handled differently. Inode buffers are handled differently
864 * in that we only recover a specific set of data from them, namely
865 * the inode di_next_unlinked fields. This is because all other inode
866 * data is actually logged via inode records and any data we replay
867 * here which overlaps that may be stale.
869 * When meta-data buffers are freed at run time we log a buffer item
870 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
871 * of the buffer in the log should not be replayed at recovery time.
872 * This is so that if the blocks covered by the buffer are reused for
873 * file data before we crash we don't end up replaying old, freed
874 * meta-data into a user's file.
876 * To handle the cancellation of buffer log items, we make two passes
877 * over the log during recovery. During the first we build a table of
878 * those buffers which have been cancelled, and during the second we
879 * only replay those buffers which do not have corresponding cancel
880 * records in the table. See xlog_recover_buf_commit_pass[1,2] above
881 * for more details on the implementation of the table of cancel records.
884 xlog_recover_buf_commit_pass2(
886 struct list_head
*buffer_list
,
887 struct xlog_recover_item
*item
,
888 xfs_lsn_t current_lsn
)
890 struct xfs_buf_log_format
*buf_f
= item
->ri_buf
[0].i_addr
;
891 struct xfs_mount
*mp
= log
->l_mp
;
898 * In this pass we only want to recover all the buffers which have
899 * not been cancelled and are not cancellation buffers themselves.
901 if (buf_f
->blf_flags
& XFS_BLF_CANCEL
) {
902 if (xlog_put_buffer_cancelled(log
, buf_f
->blf_blkno
,
907 if (xlog_is_buffer_cancelled(log
, buf_f
->blf_blkno
,
912 trace_xfs_log_recover_buf_recover(log
, buf_f
);
915 if (buf_f
->blf_flags
& XFS_BLF_INODE_BUF
)
916 buf_flags
|= XBF_UNMAPPED
;
918 error
= xfs_buf_read(mp
->m_ddev_targp
, buf_f
->blf_blkno
, buf_f
->blf_len
,
919 buf_flags
, &bp
, NULL
);
924 * Recover the buffer only if we get an LSN from it and it's less than
925 * the lsn of the transaction we are replaying.
927 * Note that we have to be extremely careful of readahead here.
928 * Readahead does not attach verifiers to the buffers so if we don't
929 * actually do any replay after readahead because the LSN we found
930 * in the buffer is more recent than the current transaction then we
931 * need to attach the verifier directly. Failure to do so means that
932 * future recovery actions (e.g. EFI and unlinked list recovery) can
933 * operate on the buffers and they won't get the verifier attached. This
934 * can lead to blocks on disk having the correct content but a stale
937 * It is safe to assume these clean buffers are currently up to date.
938 * If the buffer is dirtied by a later transaction being replayed, then
939 * the verifier will be reset to match whatever recover turns that
942 lsn
= xlog_recover_get_buf_lsn(mp
, bp
, buf_f
);
943 if (lsn
&& lsn
!= -1 && XFS_LSN_CMP(lsn
, current_lsn
) >= 0) {
944 trace_xfs_log_recover_buf_skip(log
, buf_f
);
945 xlog_recover_validate_buf_type(mp
, bp
, buf_f
, NULLCOMMITLSN
);
949 if (buf_f
->blf_flags
& XFS_BLF_INODE_BUF
) {
950 error
= xlog_recover_do_inode_buffer(mp
, item
, bp
, buf_f
);
953 } else if (buf_f
->blf_flags
&
954 (XFS_BLF_UDQUOT_BUF
|XFS_BLF_PDQUOT_BUF
|XFS_BLF_GDQUOT_BUF
)) {
957 dirty
= xlog_recover_do_dquot_buffer(mp
, log
, item
, bp
, buf_f
);
961 xlog_recover_do_reg_buffer(mp
, item
, bp
, buf_f
, current_lsn
);
965 * Perform delayed write on the buffer. Asynchronous writes will be
966 * slower when taking into account all the buffers to be flushed.
968 * Also make sure that only inode buffers with good sizes stay in
969 * the buffer cache. The kernel moves inodes in buffers of 1 block
970 * or inode_cluster_size bytes, whichever is bigger. The inode
971 * buffers in the log can be a different size if the log was generated
972 * by an older kernel using unclustered inode buffers or a newer kernel
973 * running with a different inode cluster size. Regardless, if
974 * the inode buffer size isn't max(blocksize, inode_cluster_size)
975 * for *our* value of inode_cluster_size, then we need to keep
976 * the buffer out of the buffer cache so that the buffer won't
977 * overlap with future reads of those inodes.
979 if (XFS_DINODE_MAGIC
==
980 be16_to_cpu(*((__be16
*)xfs_buf_offset(bp
, 0))) &&
981 (BBTOB(bp
->b_length
) != M_IGEO(log
->l_mp
)->inode_cluster_size
)) {
983 error
= xfs_bwrite(bp
);
985 ASSERT(bp
->b_mount
== mp
);
986 bp
->b_flags
|= _XBF_LOGRECOVERY
;
987 xfs_buf_delwri_queue(bp
, buffer_list
);
994 trace_xfs_log_recover_buf_cancel(log
, buf_f
);
/*
 * Recovery operations table for buffer log items (item type XFS_LI_BUF). It
 * wires up the reorder classifier, the pass-2 readahead hook and the pass-1 /
 * pass-2 commit handlers defined earlier in this file.
 */
998 const struct xlog_recover_item_ops xlog_buf_item_ops
= {
999 .item_type
= XFS_LI_BUF
,
1000 .reorder
= xlog_recover_buf_reorder
,
1001 .ra_pass2
= xlog_recover_buf_ra_pass2
,
1002 .commit_pass1
= xlog_recover_buf_commit_pass1
,
1003 .commit_pass2
= xlog_recover_buf_commit_pass2
,
1008 xlog_check_buf_cancel_table(
1013 for (i
= 0; i
< XLOG_BC_TABLE_SIZE
; i
++)
1014 ASSERT(list_empty(&log
->l_buf_cancel_table
[i
]));
1019 xlog_alloc_buf_cancel_table(
1025 ASSERT(log
->l_buf_cancel_table
== NULL
);
1027 p
= kmalloc_array(XLOG_BC_TABLE_SIZE
, sizeof(struct list_head
),
1032 log
->l_buf_cancel_table
= p
;
1033 for (i
= 0; i
< XLOG_BC_TABLE_SIZE
; i
++)
1034 INIT_LIST_HEAD(&log
->l_buf_cancel_table
[i
]);
1040 xlog_free_buf_cancel_table(
1045 if (!log
->l_buf_cancel_table
)
1048 for (i
= 0; i
< XLOG_BC_TABLE_SIZE
; i
++) {
1049 struct xfs_buf_cancel
*bc
;
1051 while ((bc
= list_first_entry_or_null(
1052 &log
->l_buf_cancel_table
[i
],
1053 struct xfs_buf_cancel
, bc_list
))) {
1054 list_del(&bc
->bc_list
);
1059 kmem_free(log
->l_buf_cancel_table
);
1060 log
->l_buf_cancel_table
= NULL
;