1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "libxfs.h"
7 #include "libxlog.h"
8
9 #define xfs_readonly_buftarg(buftarg) (0)
10
11 /* avoid set-but-unused var warning. gcc is not very bright. */
12 #define xlog_clear_stale_blocks(log, taillsn) ({ \
13 (taillsn) = (taillsn); \
14 (0); \
15 })
16
17 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
18
19 /*
20 * Verify that the given count of basic blocks is a valid number of blocks
21 * to specify for an operation involving the given XFS log buffer.
22 * Returns nonzero if the count is valid, 0 otherwise.
23 */
24
25 static inline int
26 xlog_buf_bbcount_valid(
27 struct xlog *log,
28 int bbcount)
29 {
30 return bbcount > 0 && bbcount <= log->l_logBBsize;
31 }
32
33 /*
34 * Allocate a buffer to hold log data. The buffer needs to be able
35 * to map to a range of nbblks basic blocks at any valid (basic
36 * block) offset within the log.
37 */
38 xfs_buf_t *
39 xlog_get_bp(
40 struct xlog *log,
41 int nbblks)
42 {
43 if (!xlog_buf_bbcount_valid(log, nbblks)) {
44 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
45 nbblks);
46 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
47 return NULL;
48 }
49
50 /*
51 * We do log I/O in units of log sectors (a power-of-2
52 * multiple of the basic block size), so we round up the
53 * requested size to accommodate the basic blocks required
54 * for complete log sectors.
55 *
56 * In addition, the buffer may be used for a non-sector-
57 * aligned block offset, in which case an I/O of the
58 * requested size could extend beyond the end of the
59 * buffer. If the requested size is only 1 basic block it
60 * will never straddle a sector boundary, so this won't be
61 * an issue. Nor will this be a problem if the log I/O is
62 * done in basic blocks (sector size 1). But otherwise we
63 * extend the buffer by one extra log sector to ensure
64 * there's space to accommodate this possibility.
65 */
66 if (nbblks > 1 && log->l_sectBBsize > 1)
67 nbblks += log->l_sectBBsize;
68 nbblks = round_up(nbblks, log->l_sectBBsize);
69
70 return libxfs_getbufr(log->l_dev, (xfs_daddr_t)-1, nbblks);
71 }
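
/*
 * Illustrative sketch (not part of the build): a hypothetical helper that
 * mirrors the padding and rounding done in xlog_get_bp() above.  With
 * nbblks = 3 and a log sector size of 8 basic blocks, one extra sector is
 * added for a possibly unaligned start (3 + 8 = 11) and the result is
 * rounded up to a whole number of sectors, giving a 16-block buffer.
 */
#if 0
static int
example_bp_size(int nbblks, int sectBBsize)
{
	if (nbblks > 1 && sectBBsize > 1)
		nbblks += sectBBsize;		/* room for unaligned offset */
	/* round_up() to the next multiple of the sector size */
	nbblks = ((nbblks + sectBBsize - 1) / sectBBsize) * sectBBsize;
	return nbblks;				/* (3, 8) -> 16 */
}
#endif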
72
73 void
74 xlog_put_bp(
75 xfs_buf_t *bp)
76 {
77 libxfs_putbufr(bp);
78 }
79
80 /*
81 * Return the address of the start of the given block number's data
82 * in a log buffer. The buffer covers a log sector-aligned region.
83 */
84 STATIC char *
85 xlog_align(
86 struct xlog *log,
87 xfs_daddr_t blk_no,
88 int nbblks,
89 struct xfs_buf *bp)
90 {
91 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
92
93 ASSERT(offset + nbblks <= bp->b_length);
94 return bp->b_addr + BBTOB(offset);
95 }
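
/*
 * Illustrative sketch (not part of the build): a hypothetical helper showing
 * how xlog_align() locates a block inside the sector-aligned buffer.  For
 * blk_no = 37 and a sector size of 8 basic blocks, the read started at
 * block 32, so the requested block lives 5 basic blocks (5 * 512 bytes)
 * into the buffer.
 */
#if 0
static char *
example_align(char *buf_addr, long long blk_no, int sectBBsize)
{
	/* sector size is a power of two, so masking gives blk_no % sectBBsize */
	long long offset = blk_no & ((long long)sectBBsize - 1);	/* 37 & 7 = 5 */

	return buf_addr + (offset << 9);	/* BBTOB(): basic blocks to bytes */
}
#endif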
96
97
98 /*
99 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
100 */
101 int
102 xlog_bread_noalign(
103 struct xlog *log,
104 xfs_daddr_t blk_no,
105 int nbblks,
106 struct xfs_buf *bp)
107 {
108 if (!xlog_buf_bbcount_valid(log, nbblks)) {
109 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
110 nbblks);
111 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
112 return EFSCORRUPTED;
113 }
114
115 blk_no = round_down(blk_no, log->l_sectBBsize);
116 nbblks = round_up(nbblks, log->l_sectBBsize);
117
118 ASSERT(nbblks > 0);
119 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
120
121 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
122 bp->b_bcount = BBTOB(nbblks);
123 bp->b_error = 0;
124
125 return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
126 }
127
128 int
129 xlog_bread(
130 struct xlog *log,
131 xfs_daddr_t blk_no,
132 int nbblks,
133 struct xfs_buf *bp,
134 char **offset)
135 {
136 int error;
137
138 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
139 if (error)
140 return error;
141
142 *offset = xlog_align(log, blk_no, nbblks, bp);
143 return 0;
144 }
145
146 /*
147 * Read at an offset into the buffer. Returns with the buffer in its original
148 * state regardless of the result of the read.
149 */
150 STATIC int
151 xlog_bread_offset(
152 struct xlog *log,
153 xfs_daddr_t blk_no, /* block to read from */
154 int nbblks, /* blocks to read */
155 struct xfs_buf *bp,
156 char *offset)
157 {
158 char *orig_offset = bp->b_addr;
159 int orig_len = bp->b_bcount;
160 int error, error2;
161
162 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
163 if (error)
164 return error;
165
166 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
167
168 /* must reset buffer pointer even on error */
169 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
170 if (error)
171 return error;
172 return error2;
173 }
174
175 /*
176 * This routine finds (to an approximation) the first block in the physical
177 * log which contains the given cycle. It uses a binary search algorithm.
178 * Note that the algorithm can not be perfect because the disk will not
179 * necessarily be perfect.
180 */
181 int
182 xlog_find_cycle_start(
183 struct xlog *log,
184 struct xfs_buf *bp,
185 xfs_daddr_t first_blk,
186 xfs_daddr_t *last_blk,
187 uint cycle)
188 {
189 char *offset;
190 xfs_daddr_t mid_blk;
191 xfs_daddr_t end_blk;
192 uint mid_cycle;
193 int error;
194
195 end_blk = *last_blk;
196 mid_blk = BLK_AVG(first_blk, end_blk);
197 while (mid_blk != first_blk && mid_blk != end_blk) {
198 error = xlog_bread(log, mid_blk, 1, bp, &offset);
199 if (error)
200 return error;
201 mid_cycle = xlog_get_cycle(offset);
202 if (mid_cycle == cycle)
203 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
204 else
205 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
206 mid_blk = BLK_AVG(first_blk, end_blk);
207 }
208 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
209 (mid_blk == end_blk && mid_blk-1 == first_blk));
210
211 *last_blk = end_blk;
212
213 return 0;
214 }
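
/*
 * Illustrative sketch (not part of the build): the bisection above, run as a
 * hypothetical helper over an in-memory array of cycle numbers instead of
 * log blocks.  The invariant matches the loop in xlog_find_cycle_start():
 * first_blk always carries the "old" cycle, end_blk carries the cycle being
 * searched for, and the loop narrows the gap to a single block.
 */
#if 0
static int
example_find_cycle_start(const unsigned int *cycles, int first_blk,
			 int end_blk, unsigned int cycle)
{
	int mid_blk = (first_blk + end_blk) >> 1;	/* BLK_AVG() */

	while (mid_blk != first_blk && mid_blk != end_blk) {
		if (cycles[mid_blk] == cycle)
			end_blk = mid_blk;	/* target cycle: search left half */
		else
			first_blk = mid_blk;	/* old cycle: search right half */
		mid_blk = (first_blk + end_blk) >> 1;
	}
	return end_blk;		/* approximate first block with the given cycle */
}
#endif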
215
216 /*
217 * Check that a range of blocks does not contain stop_on_cycle_no.
218 * Fill in *new_blk with the block offset where such a block is
219 * found, or with -1 (an invalid block number) if there is no such
220 * block in the range. The scan needs to occur from front to back
221 * and the pointer into the region must be updated since a later
222 * routine will need to perform another test.
223 */
224 STATIC int
225 xlog_find_verify_cycle(
226 struct xlog *log,
227 xfs_daddr_t start_blk,
228 int nbblks,
229 uint stop_on_cycle_no,
230 xfs_daddr_t *new_blk)
231 {
232 xfs_daddr_t i, j;
233 uint cycle;
234 xfs_buf_t *bp;
235 int bufblks;
236 char *buf = NULL;
237 int error = 0;
238
239 /*
240 * Greedily allocate a buffer big enough to handle the full
241 * range of basic blocks we'll be examining. If that fails,
242 * try a smaller size. We need to be able to read at least
243 * a log sector, or we're out of luck.
244 */
245 bufblks = 1 << ffs(nbblks);
246 while (bufblks > log->l_logBBsize)
247 bufblks >>= 1;
248 while (!(bp = xlog_get_bp(log, bufblks))) {
249 bufblks >>= 1;
250 if (bufblks < log->l_sectBBsize)
251 return ENOMEM;
252 }
253
254 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
255 int bcount;
256
257 bcount = min(bufblks, (start_blk + nbblks - i));
258
259 error = xlog_bread(log, i, bcount, bp, &buf);
260 if (error)
261 goto out;
262
263 for (j = 0; j < bcount; j++) {
264 cycle = xlog_get_cycle(buf);
265 if (cycle == stop_on_cycle_no) {
266 *new_blk = i+j;
267 goto out;
268 }
269
270 buf += BBSIZE;
271 }
272 }
273
274 *new_blk = -1;
275
276 out:
277 xlog_put_bp(bp);
278 return error;
279 }
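
/*
 * Illustrative sketch (not part of the build): the same front-to-back scan
 * as xlog_find_verify_cycle(), performed by a hypothetical helper on an
 * in-memory array of cycle numbers.  It returns the offset of the first
 * block stamped with stop_on_cycle_no, or -1 if the range is clean, which
 * is what the buffered loop above reports through *new_blk.
 */
#if 0
static long long
example_verify_cycle(const unsigned int *cycles, long long start_blk,
		     int nbblks, unsigned int stop_on_cycle_no)
{
	long long i;

	for (i = 0; i < nbblks; i++)
		if (cycles[start_blk + i] == stop_on_cycle_no)
			return start_blk + i;	/* stale block found */
	return -1;				/* range is clean */
}
#endif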
280
281 /*
282 * Potentially backup over partial log record write.
283 *
284 * In the typical case, last_blk is the number of the block directly after
285 * a good log record. Therefore, we subtract one to get the block number
286 * of the last block in the given buffer. extra_bblks contains the number
287 * of blocks we would have read on a previous read. This happens when the
288 * last log record is split over the end of the physical log.
289 *
290 * extra_bblks is the number of blocks potentially verified on a previous
291 * call to this routine.
292 */
293 STATIC int
294 xlog_find_verify_log_record(
295 struct xlog *log,
296 xfs_daddr_t start_blk,
297 xfs_daddr_t *last_blk,
298 int extra_bblks)
299 {
300 xfs_daddr_t i;
301 xfs_buf_t *bp;
302 char *offset = NULL;
303 xlog_rec_header_t *head = NULL;
304 int error = 0;
305 int smallmem = 0;
306 int num_blks = *last_blk - start_blk;
307 int xhdrs;
308
309 ASSERT(start_blk != 0 || *last_blk != start_blk);
310
311 if (!(bp = xlog_get_bp(log, num_blks))) {
312 if (!(bp = xlog_get_bp(log, 1)))
313 return ENOMEM;
314 smallmem = 1;
315 } else {
316 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
317 if (error)
318 goto out;
319 offset += ((num_blks - 1) << BBSHIFT);
320 }
321
322 for (i = (*last_blk) - 1; i >= 0; i--) {
323 if (i < start_blk) {
324 /* valid log record not found */
325 xfs_warn(log->l_mp,
326 "Log inconsistent (didn't find previous header)");
327 ASSERT(0);
328 error = XFS_ERROR(EIO);
329 goto out;
330 }
331
332 if (smallmem) {
333 error = xlog_bread(log, i, 1, bp, &offset);
334 if (error)
335 goto out;
336 }
337
338 head = (xlog_rec_header_t *)offset;
339
340 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
341 break;
342
343 if (!smallmem)
344 offset -= BBSIZE;
345 }
346
347 /*
348 * We hit the beginning of the physical log & still no header. Return
349 * to caller. If caller can handle a return of -1, then this routine
350 * will be called again for the end of the physical log.
351 */
352 if (i == -1) {
353 error = -1;
354 goto out;
355 }
356
357 /*
358 * We have the final block of the good log (the first block
359 * of the log record _before_ the head). So we check the uuid.
360 */
361 if ((error = xlog_header_check_mount(log->l_mp, head)))
362 goto out;
363
364 /*
365 * We may have found a log record header before we expected one.
366 * last_blk will be the 1st block # with a given cycle #. We may end
367 * up reading an entire log record. In this case, we don't want to
368 * reset last_blk. Only when last_blk points in the middle of a log
369 * record do we update last_blk.
370 */
371 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
372 uint h_size = be32_to_cpu(head->h_size);
373
374 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
375 if (h_size % XLOG_HEADER_CYCLE_SIZE)
376 xhdrs++;
377 } else {
378 xhdrs = 1;
379 }
380
381 if (*last_blk - i + extra_bblks !=
382 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
383 *last_blk = i;
384
385 out:
386 xlog_put_bp(bp);
387 return error;
388 }
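
/*
 * Illustrative sketch (not part of the build): a hypothetical helper spelling
 * out the length check above.  A complete record occupies its header
 * block(s) plus BTOBB(h_len) data blocks; if the blocks between the header
 * found at 'i' and *last_blk (plus any blocks already verified on a previous
 * call) do not add up to that, *last_blk points into the middle of a record
 * and must be pulled back to the header block.
 */
#if 0
static int
example_record_is_complete(long long header_blk, long long last_blk,
			   int extra_bblks, int data_bblks, int hdr_bblks)
{
	return (last_blk - header_blk + extra_bblks) ==
	       (long long)(data_bblks + hdr_bblks);
}
#endif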
389
390 /*
391 * Head is defined to be the point of the log where the next log write
392 * could go. This means that incomplete LR writes at the end are
393 * eliminated when calculating the head. We aren't guaranteed that previous
394 * LRs have complete transactions. We only know that a cycle number of
395 * current cycle number -1 won't be present in the log if we start writing
396 * from our current block number.
397 *
398 * last_blk contains the block number of the first block with a given
399 * cycle number.
400 *
401 * Return: zero if normal, non-zero if error.
402 */
403 STATIC int
404 xlog_find_head(
405 struct xlog *log,
406 xfs_daddr_t *return_head_blk)
407 {
408 xfs_buf_t *bp;
409 char *offset;
410 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
411 int num_scan_bblks;
412 uint first_half_cycle, last_half_cycle;
413 uint stop_on_cycle;
414 int error, log_bbnum = log->l_logBBsize;
415
416 /* Is the end of the log device zeroed? */
417 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
418 *return_head_blk = first_blk;
419
420 /* Is the whole lot zeroed? */
421 if (!first_blk) {
422 /* Linux XFS shouldn't generate totally zeroed logs -
423 * mkfs etc write a dummy unmount record to a fresh
424 * log so we can store the uuid in there
425 */
426 xfs_warn(log->l_mp, "totally zeroed log");
427 }
428
429 return 0;
430 } else if (error) {
431 xfs_warn(log->l_mp, "empty log check failed");
432 return error;
433 }
434
435 first_blk = 0; /* get cycle # of 1st block */
436 bp = xlog_get_bp(log, 1);
437 if (!bp)
438 return ENOMEM;
439
440 error = xlog_bread(log, 0, 1, bp, &offset);
441 if (error)
442 goto bp_err;
443
444 first_half_cycle = xlog_get_cycle(offset);
445
446 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
447 error = xlog_bread(log, last_blk, 1, bp, &offset);
448 if (error)
449 goto bp_err;
450
451 last_half_cycle = xlog_get_cycle(offset);
452 ASSERT(last_half_cycle != 0);
453
454 /*
455 * If the 1st half cycle number is equal to the last half cycle number,
456 * then the entire log is stamped with the same cycle number. In this
457 * case, head_blk can't be set to zero (which makes sense). The below
458 * math doesn't work out properly with head_blk equal to zero. Instead,
459 * we set it to log_bbnum which is an invalid block number, but this
460 * value makes the math correct. If head_blk doesn't change through
461 * all the tests below, *head_blk is set to zero at the very end rather
462 * than log_bbnum. In a sense, log_bbnum and zero are the same block
463 * in a circular file.
464 */
465 if (first_half_cycle == last_half_cycle) {
466 /*
467 * In this case we believe that the entire log should have
468 * cycle number last_half_cycle. We need to scan backwards
469 * from the end verifying that there are no holes still
470 * containing last_half_cycle - 1. If we find such a hole,
471 * then the start of that hole will be the new head. The
472 * simple case looks like
473 * x | x ... | x - 1 | x
474 * Another case that fits this picture would be
475 * x | x + 1 | x ... | x
476 * In this case the head really is somewhere at the end of the
477 * log, as one of the latest writes at the beginning was
478 * incomplete.
479 * One more case is
480 * x | x + 1 | x ... | x - 1 | x
481 * This is really the combination of the above two cases, and
482 * the head has to end up at the start of the x-1 hole at the
483 * end of the log.
484 *
485 * In the 256k log case, we will read from the beginning to the
486 * end of the log and search for cycle numbers equal to x-1.
487 * We don't worry about the x+1 blocks that we encounter,
488 * because we know that they cannot be the head since the log
489 * started with x.
490 */
491 head_blk = log_bbnum;
492 stop_on_cycle = last_half_cycle - 1;
493 } else {
494 /*
495 * In this case we want to find the first block with cycle
496 * number matching last_half_cycle. We expect the log to be
497 * some variation on
498 * x + 1 ... | x ... | x
499 * The first block with cycle number x (last_half_cycle) will
500 * be where the new head belongs. First we do a binary search
501 * for the first occurrence of last_half_cycle. The binary
502 * search may not be totally accurate, so then we scan back
503 * from there looking for occurrences of last_half_cycle before
504 * us. If that backwards scan wraps around the beginning of
505 * the log, then we look for occurrences of last_half_cycle - 1
506 * at the end of the log. The cases we're looking for look
507 * like
508 * v binary search stopped here
509 * x + 1 ... | x | x + 1 | x ... | x
510 * ^ but we want to locate this spot
511 * or
512 * <---------> less than scan distance
513 * x + 1 ... | x ... | x - 1 | x
514 * ^ we want to locate this spot
515 */
516 stop_on_cycle = last_half_cycle;
517 if ((error = xlog_find_cycle_start(log, bp, first_blk,
518 &head_blk, last_half_cycle)))
519 goto bp_err;
520 }
521
522 /*
523 * Now validate the answer. Scan back some number of maximum possible
524 * blocks and make sure each one has the expected cycle number. The
525 * maximum is determined by the total possible amount of buffering
526 * in the in-core log. The following number can be made tighter if
527 * we actually look at the block size of the filesystem.
528 */
529 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
530 if (head_blk >= num_scan_bblks) {
531 /*
532 * We are guaranteed that the entire check can be performed
533 * in one buffer.
534 */
535 start_blk = head_blk - num_scan_bblks;
536 if ((error = xlog_find_verify_cycle(log,
537 start_blk, num_scan_bblks,
538 stop_on_cycle, &new_blk)))
539 goto bp_err;
540 if (new_blk != -1)
541 head_blk = new_blk;
542 } else { /* need to read 2 parts of log */
543 /*
544 * We are going to scan backwards in the log in two parts.
545 * First we scan the physical end of the log. In this part
546 * of the log, we are looking for blocks with cycle number
547 * last_half_cycle - 1.
548 * If we find one, then we know that the log starts there, as
549 * we've found a hole that didn't get written in going around
550 * the end of the physical log. The simple case for this is
551 * x + 1 ... | x ... | x - 1 | x
552 * <---------> less than scan distance
553 * If all of the blocks at the end of the log have cycle number
554 * last_half_cycle, then we check the blocks at the start of
555 * the log looking for occurrences of last_half_cycle. If we
556 * find one, then our current estimate for the location of the
557 * first occurrence of last_half_cycle is wrong and we move
558 * back to the hole we've found. This case looks like
559 * x + 1 ... | x | x + 1 | x ...
560 * ^ binary search stopped here
561 * Another case we need to handle that only occurs in 256k
562 * logs is
563 * x + 1 ... | x ... | x+1 | x ...
564 * ^ binary search stops here
565 * In a 256k log, the scan at the end of the log will see the
566 * x + 1 blocks. We need to skip past those since that is
567 * certainly not the head of the log. By searching for
568 * last_half_cycle-1 we accomplish that.
569 */
570 ASSERT(head_blk <= INT_MAX &&
571 (xfs_daddr_t) num_scan_bblks >= head_blk);
572 start_blk = log_bbnum - (num_scan_bblks - head_blk);
573 if ((error = xlog_find_verify_cycle(log, start_blk,
574 num_scan_bblks - (int)head_blk,
575 (stop_on_cycle - 1), &new_blk)))
576 goto bp_err;
577 if (new_blk != -1) {
578 head_blk = new_blk;
579 goto validate_head;
580 }
581
582 /*
583 * Scan beginning of log now. The last part of the physical
584 * log is good. This scan needs to verify that it doesn't find
585 * the last_half_cycle.
586 */
587 start_blk = 0;
588 ASSERT(head_blk <= INT_MAX);
589 if ((error = xlog_find_verify_cycle(log,
590 start_blk, (int)head_blk,
591 stop_on_cycle, &new_blk)))
592 goto bp_err;
593 if (new_blk != -1)
594 head_blk = new_blk;
595 }
596
597 validate_head:
598 /*
599 * Now we need to make sure head_blk is not pointing to a block in
600 * the middle of a log record.
601 */
602 num_scan_bblks = XLOG_REC_SHIFT(log);
603 if (head_blk >= num_scan_bblks) {
604 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
605
606 /* start ptr at last block ptr before head_blk */
607 if ((error = xlog_find_verify_log_record(log, start_blk,
608 &head_blk, 0)) == -1) {
609 error = XFS_ERROR(EIO);
610 goto bp_err;
611 } else if (error)
612 goto bp_err;
613 } else {
614 start_blk = 0;
615 ASSERT(head_blk <= INT_MAX);
616 if ((error = xlog_find_verify_log_record(log, start_blk,
617 &head_blk, 0)) == -1) {
618 /* We hit the beginning of the log during our search */
619 start_blk = log_bbnum - (num_scan_bblks - head_blk);
620 new_blk = log_bbnum;
621 ASSERT(start_blk <= INT_MAX &&
622 (xfs_daddr_t) log_bbnum-start_blk >= 0);
623 ASSERT(head_blk <= INT_MAX);
624 if ((error = xlog_find_verify_log_record(log,
625 start_blk, &new_blk,
626 (int)head_blk)) == -1) {
627 error = XFS_ERROR(EIO);
628 goto bp_err;
629 } else if (error)
630 goto bp_err;
631 if (new_blk != log_bbnum)
632 head_blk = new_blk;
633 } else if (error)
634 goto bp_err;
635 }
636
637 xlog_put_bp(bp);
638 if (head_blk == log_bbnum)
639 *return_head_blk = 0;
640 else
641 *return_head_blk = head_blk;
642 /*
643 * When returning here, we have a good block number. Bad block
644 * means that during a previous crash, we didn't have a clean break
645 * from cycle number N to cycle number N-1. In this case, we need
646 * to find the first block with cycle number N-1.
647 */
648 return 0;
649
650 bp_err:
651 xlog_put_bp(bp);
652
653 if (error)
654 xfs_warn(log->l_mp, "failed to find log head");
655 return error;
656 }
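
/*
 * Illustrative sketch (not part of the build): the head-finding idea above,
 * reduced by a hypothetical helper to an array of cycle numbers.  When the
 * first and last blocks carry different cycles, the head is the first block
 * stamped with the ending (older) cycle; when the whole log carries one
 * cycle, the head is wherever a hole of cycle - 1 blocks begins, or block 0
 * if no such hole exists.
 */
#if 0
static int
example_find_head(const unsigned int *cycles, int log_bbnum)
{
	unsigned int last_cycle = cycles[log_bbnum - 1];
	int blk;

	if (cycles[0] != last_cycle) {
		/* log wrapped: find the first block with the ending cycle */
		for (blk = 0; blk < log_bbnum; blk++)
			if (cycles[blk] == last_cycle)
				return blk;
	} else {
		/* one cycle throughout: look for a stale cycle - 1 hole */
		for (blk = 0; blk < log_bbnum; blk++)
			if (cycles[blk] == last_cycle - 1)
				return blk;
	}
	return 0;	/* no hole found: head wraps back to block zero */
}
#endif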
657
658 /*
659 * Find the sync block number or the tail of the log.
660 *
661 * This will be the block number of the last record to have its
662 * associated buffers synced to disk. Every log record header has
663 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
664 * to get a sync block number. The only concern is to figure out which
665 * log record header to believe.
666 *
667 * The following algorithm uses the log record header with the largest
668 * lsn. The entire log record does not need to be valid. We only care
669 * that the header is valid.
670 *
671 * We could speed up search by using current head_blk buffer, but it is not
672 * available.
673 */
674 int
675 xlog_find_tail(
676 struct xlog *log,
677 xfs_daddr_t *head_blk,
678 xfs_daddr_t *tail_blk)
679 {
680 xlog_rec_header_t *rhead;
681 xlog_op_header_t *op_head;
682 char *offset = NULL;
683 xfs_buf_t *bp;
684 int error, i, found;
685 xfs_daddr_t umount_data_blk;
686 xfs_daddr_t after_umount_blk;
687 xfs_lsn_t tail_lsn;
688 int hblks;
689
690 found = 0;
691
692 /*
693 * Find previous log record
694 */
695 if ((error = xlog_find_head(log, head_blk)))
696 return error;
697
698 bp = xlog_get_bp(log, 1);
699 if (!bp)
700 return ENOMEM;
701 if (*head_blk == 0) { /* special case */
702 error = xlog_bread(log, 0, 1, bp, &offset);
703 if (error)
704 goto done;
705
706 if (xlog_get_cycle(offset) == 0) {
707 *tail_blk = 0;
708 /* leave all other log inited values alone */
709 goto done;
710 }
711 }
712
713 /*
714 * Search backwards looking for log record header block
715 */
716 ASSERT(*head_blk < INT_MAX);
717 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
718 error = xlog_bread(log, i, 1, bp, &offset);
719 if (error)
720 goto done;
721
722 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
723 found = 1;
724 break;
725 }
726 }
727 /*
728 * If we haven't found the log record header block, start looking
729 * again from the end of the physical log. XXXmiken: There should be
730 * a check here to make sure we didn't search more than N blocks in
731 * the previous code.
732 */
733 if (!found) {
734 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
735 error = xlog_bread(log, i, 1, bp, &offset);
736 if (error)
737 goto done;
738
739 if (*(__be32 *)offset ==
740 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
741 found = 2;
742 break;
743 }
744 }
745 }
746 if (!found) {
747 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
748 xlog_put_bp(bp);
749 ASSERT(0);
750 return XFS_ERROR(EIO);
751 }
752
753 /* find blk_no of tail of log */
754 rhead = (xlog_rec_header_t *)offset;
755 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
756
757 /*
758 * Reset log values according to the state of the log when we
759 * crashed. In the case where head_blk == 0, we bump curr_cycle
760 * one because the next write starts a new cycle rather than
761 * continuing the cycle of the last good log record. At this
762 * point we have guaranteed that all partial log records have been
763 * accounted for. Therefore, we know that the last good log record
764 * written was complete and ended exactly on the end boundary
765 * of the physical log.
766 */
767 log->l_prev_block = i;
768 log->l_curr_block = (int)*head_blk;
769 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
770 if (found == 2)
771 log->l_curr_cycle++;
772 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
773 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
774 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
775 BBTOB(log->l_curr_block));
776 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
777 BBTOB(log->l_curr_block));
778
779 /*
780 * Look for unmount record. If we find it, then we know there
781 * was a clean unmount. Since 'i' could be the last block in
782 * the physical log, we convert to a log block before comparing
783 * to the head_blk.
784 *
785 * Save the current tail lsn to use to pass to
786 * xlog_clear_stale_blocks() below. We won't want to clear the
787 * unmount record if there is one, so we pass the lsn of the
788 * unmount record rather than the block after it.
789 */
790 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
791 int h_size = be32_to_cpu(rhead->h_size);
792 int h_version = be32_to_cpu(rhead->h_version);
793
794 if ((h_version & XLOG_VERSION_2) &&
795 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
796 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
797 if (h_size % XLOG_HEADER_CYCLE_SIZE)
798 hblks++;
799 } else {
800 hblks = 1;
801 }
802 } else {
803 hblks = 1;
804 }
805 after_umount_blk = (i + hblks + (int)
806 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
807 tail_lsn = atomic64_read(&log->l_tail_lsn);
808 if (*head_blk == after_umount_blk &&
809 be32_to_cpu(rhead->h_num_logops) == 1) {
810 umount_data_blk = (i + hblks) % log->l_logBBsize;
811 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
812 if (error)
813 goto done;
814
815 op_head = (xlog_op_header_t *)offset;
816 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
817 /*
818 * Set tail and last sync so that newly written
819 * log records will point recovery to after the
820 * current unmount record.
821 */
822 xlog_assign_atomic_lsn(&log->l_tail_lsn,
823 log->l_curr_cycle, after_umount_blk);
824 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
825 log->l_curr_cycle, after_umount_blk);
826 *tail_blk = after_umount_blk;
827
828 /*
829 * Note that the unmount was clean. If the unmount
830 * was not clean, we need to know this to rebuild the
831 * superblock counters from the perag headers if we
832 * have a filesystem using non-persistent counters.
833 */
834 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
835 }
836 }
837
838 /*
839 * Make sure that there are no blocks in front of the head
840 * with the same cycle number as the head. This can happen
841 * because we allow multiple outstanding log writes concurrently,
842 * and the later writes might make it out before earlier ones.
843 *
844 * We use the lsn from before modifying it so that we'll never
845 * overwrite the unmount record after a clean unmount.
846 *
847 * Do this only if we are going to recover the filesystem
848 *
849 * NOTE: This used to say "if (!readonly)"
850 * However on Linux, we can & do recover a read-only filesystem.
851 * We only skip recovery if NORECOVERY is specified on mount,
852 * in which case we would not be here.
853 *
854 * But... if the -device- itself is readonly, just skip this.
855 * We can't recover this device anyway, so it won't matter.
856 */
857 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
858 error = xlog_clear_stale_blocks(log, tail_lsn);
859
860 done:
861 xlog_put_bp(bp);
862
863 if (error)
864 xfs_warn(log->l_mp, "failed to locate log tail");
865 return error;
866 }
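
/*
 * Illustrative sketch (not part of the build): a hypothetical helper for the
 * hblks computation used above and in xlog_do_recovery_pass().  Each v2
 * record header block carries cycle data for XLOG_HEADER_CYCLE_SIZE (32k)
 * bytes of record data, so an iclog size of 32k needs one header block,
 * while a hypothetical 256k iclog would need 256k / 32k = 8, with any
 * remainder rounding the count up by one.
 */
#if 0
static int
example_header_blocks(int h_size, int header_cycle_size)
{
	int hblks = h_size / header_cycle_size;

	if (h_size % header_cycle_size)
		hblks++;		/* partial header still needs a block */
	return hblks ? hblks : 1;	/* v1 logs and small iclogs use one block */
}
#endif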
867
868 /*
869 * Is the log zeroed at all?
870 *
871 * The last binary search should be changed to perform an X block read
872 * once X becomes small enough. You can then search linearly through
873 * the X blocks. This will cut down on the number of reads we need to do.
874 *
875 * If the log is partially zeroed, this routine will pass back the blkno
876 * of the first block with cycle number 0. It won't have a complete LR
877 * preceding it.
878 *
879 * Return:
880 * 0 => the log is completely written to
881 * -1 => use *blk_no as the first block of the log
882 * >0 => error has occurred
883 */
884 int
885 xlog_find_zeroed(
886 struct xlog *log,
887 xfs_daddr_t *blk_no)
888 {
889 xfs_buf_t *bp;
890 char *offset;
891 uint first_cycle, last_cycle;
892 xfs_daddr_t new_blk, last_blk, start_blk;
893 xfs_daddr_t num_scan_bblks;
894 int error, log_bbnum = log->l_logBBsize;
895
896 *blk_no = 0;
897
898 /* check totally zeroed log */
899 bp = xlog_get_bp(log, 1);
900 if (!bp)
901 return ENOMEM;
902 error = xlog_bread(log, 0, 1, bp, &offset);
903 if (error)
904 goto bp_err;
905
906 first_cycle = xlog_get_cycle(offset);
907 if (first_cycle == 0) { /* completely zeroed log */
908 *blk_no = 0;
909 xlog_put_bp(bp);
910 return -1;
911 }
912
913 /* check partially zeroed log */
914 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
915 if (error)
916 goto bp_err;
917
918 last_cycle = xlog_get_cycle(offset);
919 if (last_cycle != 0) { /* log completely written to */
920 xlog_put_bp(bp);
921 return 0;
922 } else if (first_cycle != 1) {
923 /*
924 * If the cycle of the last block is zero, the cycle of
925 * the first block must be 1. If it's not, maybe we're
926 * not looking at a log... Bail out.
927 */
928 xfs_warn(log->l_mp,
929 "Log inconsistent or not a log (last==0, first!=1)");
930 error = XFS_ERROR(EINVAL);
931 goto bp_err;
932 }
933
934 /* we have a partially zeroed log */
935 last_blk = log_bbnum-1;
936 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
937 goto bp_err;
938
939 /*
940 * Validate the answer. Because there is no way to guarantee that
941 * the entire log is made up of log records which are the same size,
942 * we scan over the defined maximum blocks. At this point, the maximum
943 * is not chosen to mean anything special. XXXmiken
944 */
945 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
946 ASSERT(num_scan_bblks <= INT_MAX);
947
948 if (last_blk < num_scan_bblks)
949 num_scan_bblks = last_blk;
950 start_blk = last_blk - num_scan_bblks;
951
952 /*
953 * We search for any instances of cycle number 0 that occur before
954 * our current estimate of the head. What we're trying to detect is
955 * 1 ... | 0 | 1 | 0...
956 * ^ binary search ends here
957 */
958 if ((error = xlog_find_verify_cycle(log, start_blk,
959 (int)num_scan_bblks, 0, &new_blk)))
960 goto bp_err;
961 if (new_blk != -1)
962 last_blk = new_blk;
963
964 /*
965 * Potentially backup over partial log record write. We don't need
966 * to search the end of the log because we know it is zero.
967 */
968 if ((error = xlog_find_verify_log_record(log, start_blk,
969 &last_blk, 0)) == -1) {
970 error = XFS_ERROR(EIO);
971 goto bp_err;
972 } else if (error)
973 goto bp_err;
974
975 *blk_no = last_blk;
976 bp_err:
977 xlog_put_bp(bp);
978 if (error)
979 return error;
980 return -1;
981 }
982
983 STATIC xlog_recover_t *
984 xlog_recover_find_tid(
985 struct hlist_head *head,
986 xlog_tid_t tid)
987 {
988 xlog_recover_t *trans;
989 struct hlist_node *n;
990
991 hlist_for_each_entry(trans, n, head, r_list) {
992 if (trans->r_log_tid == tid)
993 return trans;
994 }
995 return NULL;
996 }
997
998 STATIC void
999 xlog_recover_new_tid(
1000 struct hlist_head *head,
1001 xlog_tid_t tid,
1002 xfs_lsn_t lsn)
1003 {
1004 xlog_recover_t *trans;
1005
1006 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1007 trans->r_log_tid = tid;
1008 trans->r_lsn = lsn;
1009 INIT_LIST_HEAD(&trans->r_itemq);
1010
1011 INIT_HLIST_NODE(&trans->r_list);
1012 hlist_add_head(&trans->r_list, head);
1013 }
1014
1015 STATIC void
1016 xlog_recover_add_item(
1017 struct list_head *head)
1018 {
1019 xlog_recover_item_t *item;
1020
1021 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1022 INIT_LIST_HEAD(&item->ri_list);
1023 list_add_tail(&item->ri_list, head);
1024 }
1025
1028 STATIC int
1029 xlog_recover_add_to_cont_trans(
1030 struct xlog *log,
1031 struct xlog_recover *trans,
1032 char *dp,
1033 int len)
1034 {
1035 xlog_recover_item_t *item;
1036 char *ptr, *old_ptr;
1037 int old_len;
1038
1039 if (list_empty(&trans->r_itemq)) {
1040 /* finish copying rest of trans header */
1041 xlog_recover_add_item(&trans->r_itemq);
1042 ptr = (char *) &trans->r_theader +
1043 sizeof(xfs_trans_header_t) - len;
1044 memcpy(ptr, dp, len); /* d, s, l */
1045 return 0;
1046 }
1047 /* take the tail entry */
1048 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1049
1050 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1051 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1052
1053 ptr = kmem_realloc(old_ptr, len+old_len, KM_SLEEP);
1054 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1055 item->ri_buf[item->ri_cnt-1].i_len += len;
1056 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1057 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1058 return 0;
1059 }
1060
1061 /*
1062 * The next region to add is the start of a new region. It could be
1063 * a whole region or it could be the first part of a new region. Because
1064 * of this, the assumption here is that the type and size fields of all
1065 * format structures fit into the first 32 bits of the structure.
1066 *
1067 * This works because all regions must be 32 bit aligned. Therefore, we
1068 * either have both fields or we have neither field. In the case we have
1069 * neither field, the data part of the region is zero length. We only have
1070 * a log_op_header and can throw away the header since a new one will appear
1071 * later. If we have at least 4 bytes, then we can determine how many regions
1072 * will appear in the current log item.
1073 */
1074 STATIC int
1075 xlog_recover_add_to_trans(
1076 struct xlog *log,
1077 struct xlog_recover *trans,
1078 char *dp,
1079 int len)
1080 {
1081 struct xfs_inode_log_format *in_f; /* any will do */
1082 xlog_recover_item_t *item;
1083 char *ptr;
1084
1085 if (!len)
1086 return 0;
1087 if (list_empty(&trans->r_itemq)) {
1088 /* we need to catch log corruptions here */
1089 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1090 xfs_warn(log->l_mp, "%s: bad header magic number",
1091 __func__);
1092 ASSERT(0);
1093 return XFS_ERROR(EIO);
1094 }
1095 if (len == sizeof(xfs_trans_header_t))
1096 xlog_recover_add_item(&trans->r_itemq);
1097 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1098 return 0;
1099 }
1100
1101 ptr = kmem_alloc(len, KM_SLEEP);
1102 memcpy(ptr, dp, len);
1103 in_f = (struct xfs_inode_log_format *)ptr;
1104
1105 /* take the tail entry */
1106 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1107 if (item->ri_total != 0 &&
1108 item->ri_total == item->ri_cnt) {
1109 /* tail item is in use, get a new one */
1110 xlog_recover_add_item(&trans->r_itemq);
1111 item = list_entry(trans->r_itemq.prev,
1112 xlog_recover_item_t, ri_list);
1113 }
1114
1115 if (item->ri_total == 0) { /* first region to be added */
1116 if (in_f->ilf_size == 0 ||
1117 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1118 xfs_warn(log->l_mp,
1119 "bad number of regions (%d) in inode log format",
1120 in_f->ilf_size);
1121 ASSERT(0);
1122 kmem_free(ptr);
1123 return XFS_ERROR(EIO);
1124 }
1125
1126 item->ri_total = in_f->ilf_size;
1127 item->ri_buf =
1128 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1129 KM_SLEEP);
1130 }
1131 ASSERT(item->ri_total > item->ri_cnt);
1132 /* Description region is ri_buf[0] */
1133 item->ri_buf[item->ri_cnt].i_addr = ptr;
1134 item->ri_buf[item->ri_cnt].i_len = len;
1135 item->ri_cnt++;
1136 trace_xfs_log_recover_item_add(log, trans, item, 0);
1137 return 0;
1138 }
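
/*
 * Illustrative sketch (not part of the build): the "first 32 bits" assumption
 * described in the comment above xlog_recover_add_to_trans(), shown with a
 * hypothetical header type.  Log item format structures begin with 16-bit
 * type and size fields, so once at least 4 bytes of a region have arrived we
 * can tell how many regions (ri_total) the item will carry.
 */
#if 0
struct example_item_header {
	unsigned short	type;	/* e.g. an inode or buffer log item type */
	unsigned short	size;	/* number of regions making up this item */
};

static int
example_region_count(const char *dp, int len)
{
	const struct example_item_header *hdr = (const void *)dp;

	if (len < 4)
		return 0;	/* not enough data to know yet */
	return hdr->size;
}
#endif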
1139
1140 /*
1141 * Free up any resources allocated by the transaction
1142 *
1143 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
1144 */
1145 STATIC void
1146 xlog_recover_free_trans(
1147 struct xlog_recover *trans)
1148 {
1149 xlog_recover_item_t *item, *n;
1150 int i;
1151
1152 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
1153 /* Free the regions in the item. */
1154 list_del(&item->ri_list);
1155 for (i = 0; i < item->ri_cnt; i++)
1156 kmem_free(item->ri_buf[i].i_addr);
1157 /* Free the item itself */
1158 kmem_free(item->ri_buf);
1159 kmem_free(item);
1160 }
1161 /* Free the transaction recover structure */
1162 kmem_free(trans);
1163 }
1164
1165 /*
1166 * Perform the transaction.
1167 *
1168 * If the transaction modifies a buffer or inode, do it now. Otherwise,
1169 * EFIs and EFDs get queued up by adding entries into the AIL for them.
1170 */
1171 STATIC int
1172 xlog_recover_commit_trans(
1173 struct xlog *log,
1174 struct xlog_recover *trans,
1175 int pass)
1176 {
1177 int error = 0;
1178
1179 hlist_del(&trans->r_list);
1180 if ((error = xlog_recover_do_trans(log, trans, pass)))
1181 return error;
1182
1183 xlog_recover_free_trans(trans);
1184 return 0;
1185 }
1186
1187 STATIC int
1188 xlog_recover_unmount_trans(
1189 struct xlog *log,
xlog_recover_t *trans)
1190 {
1191 /* Do nothing now */
1192 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
1193 return 0;
1194 }
1195
1196 /*
1197 * There are two valid states of the r_state field. 0 indicates that the
1198 * transaction structure is in a normal state. We have either seen the
1199 * start of the transaction or the last operation we added was not a partial
1200 * operation. If the last operation we added to the transaction was a
1201 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
1202 *
1203 * NOTE: skip LRs with 0 data length.
1204 */
1205 STATIC int
1206 xlog_recover_process_data(
1207 struct xlog *log,
1208 struct hlist_head rhash[],
1209 struct xlog_rec_header *rhead,
1210 char *dp,
1211 int pass)
1212 {
1213 char *lp;
1214 int num_logops;
1215 xlog_op_header_t *ohead;
1216 xlog_recover_t *trans;
1217 xlog_tid_t tid;
1218 int error;
1219 unsigned long hash;
1220 uint flags;
1221
1222 lp = dp + be32_to_cpu(rhead->h_len);
1223 num_logops = be32_to_cpu(rhead->h_num_logops);
1224
1225 /* check the log format matches our own - else we can't recover */
1226 if (xlog_header_check_recover(log->l_mp, rhead))
1227 return (XFS_ERROR(EIO));
1228
1229 while ((dp < lp) && num_logops) {
1230 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
1231 ohead = (xlog_op_header_t *)dp;
1232 dp += sizeof(xlog_op_header_t);
1233 if (ohead->oh_clientid != XFS_TRANSACTION &&
1234 ohead->oh_clientid != XFS_LOG) {
1235 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
1236 __func__, ohead->oh_clientid);
1237 ASSERT(0);
1238 return (XFS_ERROR(EIO));
1239 }
1240 tid = be32_to_cpu(ohead->oh_tid);
1241 hash = XLOG_RHASH(tid);
1242 trans = xlog_recover_find_tid(&rhash[hash], tid);
1243 if (trans == NULL) { /* not found; add new tid */
1244 if (ohead->oh_flags & XLOG_START_TRANS)
1245 xlog_recover_new_tid(&rhash[hash], tid,
1246 be64_to_cpu(rhead->h_lsn));
1247 } else {
1248 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
1249 xfs_warn(log->l_mp, "%s: bad length 0x%x",
1250 __func__, be32_to_cpu(ohead->oh_len));
1251 return (XFS_ERROR(EIO));
1252 }
1253 flags = ohead->oh_flags & ~XLOG_END_TRANS;
1254 if (flags & XLOG_WAS_CONT_TRANS)
1255 flags &= ~XLOG_CONTINUE_TRANS;
1256 switch (flags) {
1257 case XLOG_COMMIT_TRANS:
1258 error = xlog_recover_commit_trans(log,
1259 trans, pass);
1260 break;
1261 case XLOG_UNMOUNT_TRANS:
1262 error = xlog_recover_unmount_trans(log, trans);
1263 break;
1264 case XLOG_WAS_CONT_TRANS:
1265 error = xlog_recover_add_to_cont_trans(log,
1266 trans, dp,
1267 be32_to_cpu(ohead->oh_len));
1268 break;
1269 case XLOG_START_TRANS:
1270 xfs_warn(log->l_mp, "%s: bad transaction",
1271 __func__);
1272 ASSERT(0);
1273 error = XFS_ERROR(EIO);
1274 break;
1275 case 0:
1276 case XLOG_CONTINUE_TRANS:
1277 error = xlog_recover_add_to_trans(log, trans,
1278 dp, be32_to_cpu(ohead->oh_len));
1279 break;
1280 default:
1281 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
1282 __func__, flags);
1283 ASSERT(0);
1284 error = XFS_ERROR(EIO);
1285 break;
1286 }
1287 if (error)
1288 return error;
1289 }
1290 dp += be32_to_cpu(ohead->oh_len);
1291 num_logops--;
1292 }
1293 return 0;
1294 }
1295
1296 /*
1297 * Unpack the log buffer data and crc check it. If the check fails, issue a
1298 * warning if and only if the CRC in the header is non-zero. This makes the
1299 * check an advisory warning, and the zero CRC check will prevent failure
1300 * warnings from being emitted when upgrading the kernel from one that does not
1301 * add CRCs by default.
1302 *
1303 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
1304 * corruption failure.
1305 *
1306 * XXX: we do not calculate the CRC here yet. It's not clear what we should do
1307 * with CRC errors here in userspace, so we'll address that problem later on.
1308 */
1309 #define xlog_cksum(l,r,dp,len) ((r)->h_crc)
1310 STATIC int
1311 xlog_unpack_data_crc(
1312 struct xlog_rec_header *rhead,
1313 char *dp,
1314 struct xlog *log)
1315 {
1316 __le32 crc;
1317
1318 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
1319 if (crc != rhead->h_crc) {
1320 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
1321 xfs_alert(log->l_mp,
1322 "log record CRC mismatch: found 0x%x, expected 0x%x.",
1323 le32_to_cpu(rhead->h_crc),
1324 le32_to_cpu(crc));
1325 xfs_hex_dump(dp, 32);
1326 }
1327
1328 /*
1329 * If we've detected a log record corruption, then we can't
1330 * recover past this point. Abort recovery if we are enforcing
1331 * CRC protection by punting an error back up the stack.
1332 */
1333 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
1334 return EFSCORRUPTED;
1335 }
1336
1337 return 0;
1338 }
1339
1340 STATIC int
1341 xlog_unpack_data(
1342 struct xlog_rec_header *rhead,
1343 char *dp,
1344 struct xlog *log)
1345 {
1346 int i, j, k;
1347 int error;
1348
1349 error = xlog_unpack_data_crc(rhead, dp, log);
1350 if (error)
1351 return error;
1352
1353 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
1354 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
1355 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
1356 dp += BBSIZE;
1357 }
1358
1359 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1360 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
1361 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
1362 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1363 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1364 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
1365 dp += BBSIZE;
1366 }
1367 }
1368
1369 return 0;
1370 }
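
/*
 * Illustrative sketch (not part of the build): what the loops above undo,
 * as a hypothetical helper.  When a record is written, the first 32 bits of
 * every 512-byte basic block are overwritten with the record's cycle number
 * (so torn writes can be detected) and the original words are stashed in
 * h_cycle_data[] (and in the extended headers for large records).  Recovery
 * copies them back before the payload is interpreted.
 */
#if 0
static void
example_unpack_block(char *dp, int nblocks, const unsigned int *saved_words)
{
	int i;

	for (i = 0; i < nblocks; i++) {
		*(unsigned int *)dp = saved_words[i];	/* restore first word */
		dp += 512;				/* BBSIZE */
	}
}
#endif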
1371
1372 STATIC int
1373 xlog_valid_rec_header(
1374 struct xlog *log,
1375 struct xlog_rec_header *rhead,
1376 xfs_daddr_t blkno)
1377 {
1378 int hlen;
1379
1380 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
1381 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
1382 XFS_ERRLEVEL_LOW, log->l_mp);
1383 return XFS_ERROR(EFSCORRUPTED);
1384 }
1385 if (unlikely(
1386 (!rhead->h_version ||
1387 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
1388 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
1389 __func__, be32_to_cpu(rhead->h_version));
1390 return XFS_ERROR(EIO);
1391 }
1392
1393 /* LR body must have data or it wouldn't have been written */
1394 hlen = be32_to_cpu(rhead->h_len);
1395 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
1396 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
1397 XFS_ERRLEVEL_LOW, log->l_mp);
1398 return XFS_ERROR(EFSCORRUPTED);
1399 }
1400 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
1401 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
1402 XFS_ERRLEVEL_LOW, log->l_mp);
1403 return XFS_ERROR(EFSCORRUPTED);
1404 }
1405 return 0;
1406 }
1407
1408 /*
1409 * Read the log from tail to head and process the log records found.
1410 * Handle the two cases where the tail and head are in the same cycle
1411 * and where the active portion of the log wraps around the end of
1412 * the physical log separately. The pass parameter is passed through
1413 * to the routines called to process the data and is not looked at
1414 * here.
1415 */
1416 int
1417 xlog_do_recovery_pass(
1418 struct xlog *log,
1419 xfs_daddr_t head_blk,
1420 xfs_daddr_t tail_blk,
1421 int pass)
1422 {
1423 xlog_rec_header_t *rhead;
1424 xfs_daddr_t blk_no;
1425 char *offset;
1426 xfs_buf_t *hbp, *dbp;
1427 int error = 0, h_size;
1428 int bblks, split_bblks;
1429 int hblks, split_hblks, wrapped_hblks;
1430 struct hlist_head rhash[XLOG_RHASH_SIZE];
1431
1432 ASSERT(head_blk != tail_blk);
1433
1434 /*
1435 * Read the header of the tail block and get the iclog buffer size from
1436 * h_size. Use this to tell how many sectors make up the log header.
1437 */
1438 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1439 /*
1440 * When using variable length iclogs, read first sector of
1441 * iclog header and extract the header size from it. Get a
1442 * new hbp that is the correct size.
1443 */
1444 hbp = xlog_get_bp(log, 1);
1445 if (!hbp)
1446 return ENOMEM;
1447
1448 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
1449 if (error)
1450 goto bread_err1;
1451
1452 rhead = (xlog_rec_header_t *)offset;
1453 error = xlog_valid_rec_header(log, rhead, tail_blk);
1454 if (error)
1455 goto bread_err1;
1456 h_size = be32_to_cpu(rhead->h_size);
1457 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
1458 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1459 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1460 if (h_size % XLOG_HEADER_CYCLE_SIZE)
1461 hblks++;
1462 xlog_put_bp(hbp);
1463 hbp = xlog_get_bp(log, hblks);
1464 } else {
1465 hblks = 1;
1466 }
1467 } else {
1468 ASSERT(log->l_sectBBsize == 1);
1469 hblks = 1;
1470 hbp = xlog_get_bp(log, 1);
1471 h_size = XLOG_BIG_RECORD_BSIZE;
1472 }
1473
1474 if (!hbp)
1475 return ENOMEM;
1476 dbp = xlog_get_bp(log, BTOBB(h_size));
1477 if (!dbp) {
1478 xlog_put_bp(hbp);
1479 return ENOMEM;
1480 }
1481
1482 memset(rhash, 0, sizeof(rhash));
1483 if (tail_blk <= head_blk) {
1484 for (blk_no = tail_blk; blk_no < head_blk; ) {
1485 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1486 if (error)
1487 goto bread_err2;
1488
1489 rhead = (xlog_rec_header_t *)offset;
1490 error = xlog_valid_rec_header(log, rhead, blk_no);
1491 if (error)
1492 goto bread_err2;
1493
1494 /* blocks in data section */
1495 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1496 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
1497 &offset);
1498 if (error)
1499 goto bread_err2;
1500
1501 error = xlog_unpack_data(rhead, offset, log);
1502 if (error)
1503 goto bread_err2;
1504
1505 error = xlog_recover_process_data(log,
1506 rhash, rhead, offset, pass);
1507 if (error)
1508 goto bread_err2;
1509 blk_no += bblks + hblks;
1510 }
1511 } else {
1512 /*
1513 * Perform recovery around the end of the physical log.
1514 * When the head is not on the same cycle number as the tail,
1515 * we can't do a sequential recovery as above.
1516 */
1517 blk_no = tail_blk;
1518 while (blk_no < log->l_logBBsize) {
1519 /*
1520 * Check for header wrapping around physical end-of-log
1521 */
1522 offset = hbp->b_addr;
1523 split_hblks = 0;
1524 wrapped_hblks = 0;
1525 if (blk_no + hblks <= log->l_logBBsize) {
1526 /* Read header in one read */
1527 error = xlog_bread(log, blk_no, hblks, hbp,
1528 &offset);
1529 if (error)
1530 goto bread_err2;
1531 } else {
1532 /* This LR is split across physical log end */
1533 if (blk_no != log->l_logBBsize) {
1534 /* some data before physical log end */
1535 ASSERT(blk_no <= INT_MAX);
1536 split_hblks = log->l_logBBsize - (int)blk_no;
1537 ASSERT(split_hblks > 0);
1538 error = xlog_bread(log, blk_no,
1539 split_hblks, hbp,
1540 &offset);
1541 if (error)
1542 goto bread_err2;
1543 }
1544
1545 /*
1546 * Note: this black magic still works with
1547 * large sector sizes (non-512) only because:
1548 * - we increased the buffer size originally
1549 * by 1 sector giving us enough extra space
1550 * for the second read;
1551 * - the log start is guaranteed to be sector
1552 * aligned;
1553 * - we read the log end (LR header start)
1554 * _first_, then the log start (LR header end)
1555 * - order is important.
1556 */
1557 wrapped_hblks = hblks - split_hblks;
1558 error = xlog_bread_offset(log, 0,
1559 wrapped_hblks, hbp,
1560 offset + BBTOB(split_hblks));
1561 if (error)
1562 goto bread_err2;
1563 }
1564 rhead = (xlog_rec_header_t *)offset;
1565 error = xlog_valid_rec_header(log, rhead,
1566 split_hblks ? blk_no : 0);
1567 if (error)
1568 goto bread_err2;
1569
1570 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1571 blk_no += hblks;
1572
1573 /* Read in data for log record */
1574 if (blk_no + bblks <= log->l_logBBsize) {
1575 error = xlog_bread(log, blk_no, bblks, dbp,
1576 &offset);
1577 if (error)
1578 goto bread_err2;
1579 } else {
1580 /* This log record is split across the
1581 * physical end of log */
1582 offset = dbp->b_addr;
1583 split_bblks = 0;
1584 if (blk_no != log->l_logBBsize) {
1585 /* some data is before the physical
1586 * end of log */
1587 ASSERT(!wrapped_hblks);
1588 ASSERT(blk_no <= INT_MAX);
1589 split_bblks =
1590 log->l_logBBsize - (int)blk_no;
1591 ASSERT(split_bblks > 0);
1592 error = xlog_bread(log, blk_no,
1593 split_bblks, dbp,
1594 &offset);
1595 if (error)
1596 goto bread_err2;
1597 }
1598
1599 /*
1600 * Note: this black magic still works with
1601 * large sector sizes (non-512) only because:
1602 * - we increased the buffer size originally
1603 * by 1 sector giving us enough extra space
1604 * for the second read;
1605 * - the log start is guaranteed to be sector
1606 * aligned;
1607 * - we read the log end (LR header start)
1608 * _first_, then the log start (LR header end)
1609 * - order is important.
1610 */
1611 error = xlog_bread_offset(log, 0,
1612 bblks - split_bblks, dbp,
1613 offset + BBTOB(split_bblks));
1614 if (error)
1615 goto bread_err2;
1616 }
1617
1618 error = xlog_unpack_data(rhead, offset, log);
1619 if (error)
1620 goto bread_err2;
1621
1622 error = xlog_recover_process_data(log, rhash,
1623 rhead, offset, pass);
1624 if (error)
1625 goto bread_err2;
1626 blk_no += bblks;
1627 }
1628
1629 ASSERT(blk_no >= log->l_logBBsize);
1630 blk_no -= log->l_logBBsize;
1631
1632 /* read first part of physical log */
1633 while (blk_no < head_blk) {
1634 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1635 if (error)
1636 goto bread_err2;
1637
1638 rhead = (xlog_rec_header_t *)offset;
1639 error = xlog_valid_rec_header(log, rhead, blk_no);
1640 if (error)
1641 goto bread_err2;
1642
1643 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1644 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
1645 &offset);
1646 if (error)
1647 goto bread_err2;
1648
1649 error = xlog_unpack_data(rhead, offset, log);
1650 if (error)
1651 goto bread_err2;
1652
1653 error = xlog_recover_process_data(log, rhash,
1654 rhead, offset, pass);
1655 if (error)
1656 goto bread_err2;
1657 blk_no += bblks + hblks;
1658 }
1659 }
1660
1661 bread_err2:
1662 xlog_put_bp(dbp);
1663 bread_err1:
1664 xlog_put_bp(hbp);
1665 return error;
1666 }
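
/*
 * Illustrative sketch (not part of the build): a hypothetical helper for the
 * split-read arithmetic used when a record header or body straddles the
 * physical end of the log.  For a log of 1000 basic blocks, a 4-block header
 * starting at block 998 is read as 2 blocks up to the end of the device plus
 * 2 wrapped blocks from block 0, appended at the matching offset of the same
 * buffer via xlog_bread_offset().
 */
#if 0
static void
example_split_read(long long blk_no, int nbblks, int log_bbnum,
		   int *split, int *wrapped)
{
	if (blk_no + nbblks <= log_bbnum) {
		*split = nbblks;			/* fits without wrapping */
		*wrapped = 0;
	} else {
		*split = log_bbnum - (int)blk_no;	/* blocks before the end */
		*wrapped = nbblks - *split;		/* remainder from block 0 */
	}
}
#endif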