libxlog/xfs_log_recover.c (thirdparty/xfsprogs-dev.git)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "libxfs.h"
7 #include "libxlog.h"
8
9 #define xfs_readonly_buftarg(buftarg) (0)
10
11 /* avoid set-but-unused var warning. gcc is not very bright. */
12 #define xlog_clear_stale_blocks(log, taillsn) ({ \
13 (taillsn) = (taillsn); \
14 (0); \
15 })
16
17 #define BLK_AVG(blk1, blk2)	(((blk1) + (blk2)) >> 1)
18
19 /*
20 * Verify that the given count of basic blocks is a valid number of blocks
21 * to specify for an operation involving the given XFS log buffer.
22 * Returns nonzero if the count is valid, 0 otherwise.
23 */
24
25 static inline int
26 xlog_buf_bbcount_valid(
27 struct xlog *log,
28 int bbcount)
29 {
30 return bbcount > 0 && bbcount <= log->l_logBBsize;
31 }
32
33 /*
34 * Allocate a buffer to hold log data. The buffer needs to be able
35 * to map to a range of nbblks basic blocks at any valid (basic
36 * block) offset within the log.
37 */
38 xfs_buf_t *
39 xlog_get_bp(
40 struct xlog *log,
41 int nbblks)
42 {
43 if (!xlog_buf_bbcount_valid(log, nbblks)) {
44 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
45 nbblks);
46 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
47 return NULL;
48 }
49
50 /*
51 * We do log I/O in units of log sectors (a power-of-2
52 * multiple of the basic block size), so we round up the
53 * requested size to accommodate the basic blocks required
54 * for complete log sectors.
55 *
56 * In addition, the buffer may be used for a non-sector-
57 * aligned block offset, in which case an I/O of the
58 * requested size could extend beyond the end of the
59 * buffer. If the requested size is only 1 basic block it
60 * will never straddle a sector boundary, so this won't be
61 * an issue. Nor will this be a problem if the log I/O is
62 * done in basic blocks (sector size 1). But otherwise we
63 * extend the buffer by one extra log sector to ensure
64 * there's space to accommodate this possibility.
65 */
66 if (nbblks > 1 && log->l_sectBBsize > 1)
67 nbblks += log->l_sectBBsize;
68 nbblks = round_up(nbblks, log->l_sectBBsize);
69
70 return libxfs_getbufr(log->l_dev, (xfs_daddr_t)-1, nbblks);
71 }
72
73 void
74 xlog_put_bp(
75 xfs_buf_t *bp)
76 {
77 libxfs_putbufr(bp);
78 }
79
80 /*
81 * Return the address of the start of the given block number's data
82 * in a log buffer. The buffer covers a log sector-aligned region.
83 */
84 STATIC char *
85 xlog_align(
86 struct xlog *log,
87 xfs_daddr_t blk_no,
88 int nbblks,
89 struct xfs_buf *bp)
90 {
91 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
92
93 ASSERT(offset + nbblks <= bp->b_length);
94 return bp->b_addr + BBTOB(offset);
95 }
96
97
98 /*
99 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
100 */
101 int
102 xlog_bread_noalign(
103 struct xlog *log,
104 xfs_daddr_t blk_no,
105 int nbblks,
106 struct xfs_buf *bp)
107 {
108 if (!xlog_buf_bbcount_valid(log, nbblks)) {
109 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
110 nbblks);
111 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
112 return EFSCORRUPTED;
113 }
114
115 blk_no = round_down(blk_no, log->l_sectBBsize);
116 nbblks = round_up(nbblks, log->l_sectBBsize);
117
118 ASSERT(nbblks > 0);
119 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
120
121 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
122 bp->b_bcount = BBTOB(nbblks);
123 bp->b_error = 0;
124
125 return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
126 }
127
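/*
 * Read nbblks basic blocks starting at blk_no into bp, and return via
 * *offset a pointer to where that block range starts within the
 * sector-aligned buffer.
 */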
128 int
129 xlog_bread(
130 struct xlog *log,
131 xfs_daddr_t blk_no,
132 int nbblks,
133 struct xfs_buf *bp,
134 char **offset)
135 {
136 int error;
137
138 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
139 if (error)
140 return error;
141
142 *offset = xlog_align(log, blk_no, nbblks, bp);
143 return 0;
144 }
145
146 /*
147 * Read at an offset into the buffer. Returns with the buffer in its original
148 * state regardless of the result of the read.
149 */
150 STATIC int
151 xlog_bread_offset(
152 struct xlog *log,
153 xfs_daddr_t blk_no, /* block to read from */
154 int nbblks, /* blocks to read */
155 struct xfs_buf *bp,
156 char *offset)
157 {
158 char *orig_offset = bp->b_addr;
159 int orig_len = bp->b_bcount;
160 int error, error2;
161
162 error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
163 if (error)
164 return error;
165
166 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
167
168 /* must reset buffer pointer even on error */
169 error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
170 if (error)
171 return error;
172 return error2;
173 }
174
175 /*
176 * This routine finds (to an approximation) the first block in the physical
177 * log which contains the given cycle. It uses a binary search algorithm.
178 * Note that the algorithm can not be perfect because the disk will not
179 * necessarily be perfect.
180 */
181 int
182 xlog_find_cycle_start(
183 struct xlog *log,
184 struct xfs_buf *bp,
185 xfs_daddr_t first_blk,
186 xfs_daddr_t *last_blk,
187 uint cycle)
188 {
189 char *offset;
190 xfs_daddr_t mid_blk;
191 xfs_daddr_t end_blk;
192 uint mid_cycle;
193 int error;
194
195 end_blk = *last_blk;
196 mid_blk = BLK_AVG(first_blk, end_blk);
197 while (mid_blk != first_blk && mid_blk != end_blk) {
198 error = xlog_bread(log, mid_blk, 1, bp, &offset);
199 if (error)
200 return error;
201 mid_cycle = xlog_get_cycle(offset);
202 if (mid_cycle == cycle)
203 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
204 else
205 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
206 mid_blk = BLK_AVG(first_blk, end_blk);
207 }
208 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
209 (mid_blk == end_blk && mid_blk-1 == first_blk));
210
211 *last_blk = end_blk;
212
213 return 0;
214 }
215
216 /*
217 * Check that a range of blocks does not contain stop_on_cycle_no.
218 * Fill in *new_blk with the block offset where such a block is
219 * found, or with -1 (an invalid block number) if there is no such
220 * block in the range. The scan needs to occur from front to back
221 * and the pointer into the region must be updated since a later
222 * routine will need to perform another test.
223 */
224 STATIC int
225 xlog_find_verify_cycle(
226 struct xlog *log,
227 xfs_daddr_t start_blk,
228 int nbblks,
229 uint stop_on_cycle_no,
230 xfs_daddr_t *new_blk)
231 {
232 xfs_daddr_t i, j;
233 uint cycle;
234 xfs_buf_t *bp;
235 int bufblks;
236 char *buf = NULL;
237 int error = 0;
238
239 /*
240 * Greedily allocate a buffer big enough to handle the full
241 * range of basic blocks we'll be examining. If that fails,
242 * try a smaller size. We need to be able to read at least
243 * a log sector, or we're out of luck.
244 */
245 bufblks = 1 << ffs(nbblks);
246 while (bufblks > log->l_logBBsize)
247 bufblks >>= 1;
248 while (!(bp = xlog_get_bp(log, bufblks))) {
249 bufblks >>= 1;
250 if (bufblks < log->l_sectBBsize)
251 return ENOMEM;
252 }
253
254 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
255 int bcount;
256
257 bcount = min(bufblks, (start_blk + nbblks - i));
258
259 error = xlog_bread(log, i, bcount, bp, &buf);
260 if (error)
261 goto out;
262
263 for (j = 0; j < bcount; j++) {
264 cycle = xlog_get_cycle(buf);
265 if (cycle == stop_on_cycle_no) {
266 *new_blk = i+j;
267 goto out;
268 }
269
270 buf += BBSIZE;
271 }
272 }
273
274 *new_blk = -1;
275
276 out:
277 xlog_put_bp(bp);
278 return error;
279 }
280
281 /*
282 * Potentially backup over partial log record write.
283 *
284 * In the typical case, last_blk is the number of the block directly after
285 * a good log record. Therefore, we subtract one to get the block number
286 * of the last block in the given buffer. extra_bblks contains the number
287 * of blocks we would have read on a previous read. This happens when the
288 * last log record is split over the end of the physical log.
289 *
290 * extra_bblks is the number of blocks potentially verified on a previous
291 * call to this routine.
292 */
293 STATIC int
294 xlog_find_verify_log_record(
295 struct xlog *log,
296 xfs_daddr_t start_blk,
297 xfs_daddr_t *last_blk,
298 int extra_bblks)
299 {
300 xfs_daddr_t i;
301 xfs_buf_t *bp;
302 char *offset = NULL;
303 xlog_rec_header_t *head = NULL;
304 int error = 0;
305 int smallmem = 0;
306 int num_blks = *last_blk - start_blk;
307 int xhdrs;
308
309 ASSERT(start_blk != 0 || *last_blk != start_blk);
310
311 if (!(bp = xlog_get_bp(log, num_blks))) {
312 if (!(bp = xlog_get_bp(log, 1)))
313 return ENOMEM;
314 smallmem = 1;
315 } else {
316 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
317 if (error)
318 goto out;
319 offset += ((num_blks - 1) << BBSHIFT);
320 }
321
322 for (i = (*last_blk) - 1; i >= 0; i--) {
323 if (i < start_blk) {
324 /* valid log record not found */
325 xfs_warn(log->l_mp,
326 "Log inconsistent (didn't find previous header)");
327 ASSERT(0);
328 error = XFS_ERROR(EIO);
329 goto out;
330 }
331
332 if (smallmem) {
333 error = xlog_bread(log, i, 1, bp, &offset);
334 if (error)
335 goto out;
336 }
337
338 head = (xlog_rec_header_t *)offset;
339
340 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
341 break;
342
343 if (!smallmem)
344 offset -= BBSIZE;
345 }
346
347 /*
348 * We hit the beginning of the physical log & still no header. Return
349 * to caller. If caller can handle a return of -1, then this routine
350 * will be called again for the end of the physical log.
351 */
352 if (i == -1) {
353 error = -1;
354 goto out;
355 }
356
357 /*
358 * We have the final block of the good log (the first block
359 * of the log record _before_ the head), so we check the uuid.
360 */
361 if ((error = xlog_header_check_mount(log->l_mp, head)))
362 goto out;
363
364 /*
365 * We may have found a log record header before we expected one.
366 * last_blk will be the 1st block # with a given cycle #. We may end
367 * up reading an entire log record. In this case, we don't want to
368 * reset last_blk. Only when last_blk points in the middle of a log
369 * record do we update last_blk.
370 */
371 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
372 uint h_size = be32_to_cpu(head->h_size);
373
374 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
375 if (h_size % XLOG_HEADER_CYCLE_SIZE)
376 xhdrs++;
377 } else {
378 xhdrs = 1;
379 }
380
381 if (*last_blk - i + extra_bblks !=
382 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
383 *last_blk = i;
384
385 out:
386 xlog_put_bp(bp);
387 return error;
388 }
389
390 /*
391 * Head is defined to be the point of the log where the next log write
392 * could go. This means that incomplete LR writes at the end are
393 * eliminated when calculating the head. We aren't guaranteed that previous
394 * LRs have complete transactions. We only know that a cycle number of
395 * current cycle number -1 won't be present in the log if we start writing
396 * from our current block number.
397 *
398 * last_blk contains the block number of the first block with a given
399 * cycle number.
400 *
401 * Return: zero if normal, non-zero if error.
402 */
403 STATIC int
404 xlog_find_head(
405 struct xlog *log,
406 xfs_daddr_t *return_head_blk)
407 {
408 xfs_buf_t *bp;
409 char *offset;
410 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
411 int num_scan_bblks;
412 uint first_half_cycle, last_half_cycle;
413 uint stop_on_cycle;
414 int error, log_bbnum = log->l_logBBsize;
415
416 /* Is the end of the log device zeroed? */
417 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
418 *return_head_blk = first_blk;
419
420 /* Is the whole lot zeroed? */
421 if (!first_blk) {
422 /* Linux XFS shouldn't generate totally zeroed logs -
423 * mkfs etc write a dummy unmount record to a fresh
424 * log so we can store the uuid in there
425 */
426 xfs_warn(log->l_mp, "totally zeroed log");
427 }
428
429 return 0;
430 } else if (error) {
431 xfs_warn(log->l_mp, "empty log check failed");
432 return error;
433 }
434
435 first_blk = 0; /* get cycle # of 1st block */
436 bp = xlog_get_bp(log, 1);
437 if (!bp)
438 return ENOMEM;
439
440 error = xlog_bread(log, 0, 1, bp, &offset);
441 if (error)
442 goto bp_err;
443
444 first_half_cycle = xlog_get_cycle(offset);
445
446 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
447 error = xlog_bread(log, last_blk, 1, bp, &offset);
448 if (error)
449 goto bp_err;
450
451 last_half_cycle = xlog_get_cycle(offset);
452 ASSERT(last_half_cycle != 0);
453
454 /*
455 * If the 1st half cycle number is equal to the last half cycle number,
456 * then the entire log is stamped with the same cycle number. In this
457 * case, head_blk can't be set to zero (which makes sense). The below
458 * math doesn't work out properly with head_blk equal to zero. Instead,
459 * we set it to log_bbnum which is an invalid block number, but this
460 * value makes the math correct. If head_blk doesn't change through
461 * all the tests below, *head_blk is set to zero at the very end rather
462 * than log_bbnum. In a sense, log_bbnum and zero are the same block
463 * in a circular file.
464 */
465 if (first_half_cycle == last_half_cycle) {
466 /*
467 * In this case we believe that the entire log should have
468 * cycle number last_half_cycle. We need to scan backwards
469 * from the end verifying that there are no holes still
470 * containing last_half_cycle - 1. If we find such a hole,
471 * then the start of that hole will be the new head. The
472 * simple case looks like
473 * x | x ... | x - 1 | x
474 * Another case that fits this picture would be
475 * x | x + 1 | x ... | x
476 * In this case the head really is somewhere at the end of the
477 * log, as one of the latest writes at the beginning was
478 * incomplete.
479 * One more case is
480 * x | x + 1 | x ... | x - 1 | x
481 * This is really the combination of the above two cases, and
482 * the head has to end up at the start of the x-1 hole at the
483 * end of the log.
484 *
485 * In the 256k log case, we will read from the beginning to the
486 * end of the log and search for cycle numbers equal to x-1.
487 * We don't worry about the x+1 blocks that we encounter,
488 * because we know that they cannot be the head since the log
489 * started with x.
490 */
491 head_blk = log_bbnum;
492 stop_on_cycle = last_half_cycle - 1;
493 } else {
494 /*
495 * In this case we want to find the first block with cycle
496 * number matching last_half_cycle. We expect the log to be
497 * some variation on
498 * x + 1 ... | x ... | x
499 * The first block with cycle number x (last_half_cycle) will
500 * be where the new head belongs. First we do a binary search
501 * for the first occurrence of last_half_cycle. The binary
502 * search may not be totally accurate, so then we scan back
503 * from there looking for occurrences of last_half_cycle before
504 * us. If that backwards scan wraps around the beginning of
505 * the log, then we look for occurrences of last_half_cycle - 1
506 * at the end of the log. The cases we're looking for look
507 * like
508 * v binary search stopped here
509 * x + 1 ... | x | x + 1 | x ... | x
510 * ^ but we want to locate this spot
511 * or
512 * <---------> less than scan distance
513 * x + 1 ... | x ... | x - 1 | x
514 * ^ we want to locate this spot
515 */
516 stop_on_cycle = last_half_cycle;
517 if ((error = xlog_find_cycle_start(log, bp, first_blk,
518 &head_blk, last_half_cycle)))
519 goto bp_err;
520 }
521
522 /*
523 * Now validate the answer. Scan back some number of maximum possible
524 * blocks and make sure each one has the expected cycle number. The
525 * maximum is determined by the total possible amount of buffering
526 * in the in-core log. The following number can be made tighter if
527 * we actually look at the block size of the filesystem.
528 */
529 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
530 if (head_blk >= num_scan_bblks) {
531 /*
532 * We are guaranteed that the entire check can be performed
533 * in one buffer.
534 */
535 start_blk = head_blk - num_scan_bblks;
536 if ((error = xlog_find_verify_cycle(log,
537 start_blk, num_scan_bblks,
538 stop_on_cycle, &new_blk)))
539 goto bp_err;
540 if (new_blk != -1)
541 head_blk = new_blk;
542 } else { /* need to read 2 parts of log */
543 /*
544 * We are going to scan backwards in the log in two parts.
545 * First we scan the physical end of the log. In this part
546 * of the log, we are looking for blocks with cycle number
547 * last_half_cycle - 1.
548 * If we find one, then we know that the log starts there, as
549 * we've found a hole that didn't get written in going around
550 * the end of the physical log. The simple case for this is
551 * x + 1 ... | x ... | x - 1 | x
552 * <---------> less than scan distance
553 * If all of the blocks at the end of the log have cycle number
554 * last_half_cycle, then we check the blocks at the start of
555 * the log looking for occurrences of last_half_cycle. If we
556 * find one, then our current estimate for the location of the
557 * first occurrence of last_half_cycle is wrong and we move
558 * back to the hole we've found. This case looks like
559 * x + 1 ... | x | x + 1 | x ...
560 * ^ binary search stopped here
561 * Another case we need to handle that only occurs in 256k
562 * logs is
563 * x + 1 ... | x ... | x+1 | x ...
564 * ^ binary search stops here
565 * In a 256k log, the scan at the end of the log will see the
566 * x + 1 blocks. We need to skip past those since that is
567 * certainly not the head of the log. By searching for
568 * last_half_cycle-1 we accomplish that.
569 */
570 ASSERT(head_blk <= INT_MAX &&
571 (xfs_daddr_t) num_scan_bblks >= head_blk);
572 start_blk = log_bbnum - (num_scan_bblks - head_blk);
573 if ((error = xlog_find_verify_cycle(log, start_blk,
574 num_scan_bblks - (int)head_blk,
575 (stop_on_cycle - 1), &new_blk)))
576 goto bp_err;
577 if (new_blk != -1) {
578 head_blk = new_blk;
579 goto validate_head;
580 }
581
582 /*
583 * Scan beginning of log now. The last part of the physical
584 * log is good. This scan needs to verify that it doesn't find
585 * the last_half_cycle.
586 */
587 start_blk = 0;
588 ASSERT(head_blk <= INT_MAX);
589 if ((error = xlog_find_verify_cycle(log,
590 start_blk, (int)head_blk,
591 stop_on_cycle, &new_blk)))
592 goto bp_err;
593 if (new_blk != -1)
594 head_blk = new_blk;
595 }
596
597 validate_head:
598 /*
599 * Now we need to make sure head_blk is not pointing to a block in
600 * the middle of a log record.
601 */
602 num_scan_bblks = XLOG_REC_SHIFT(log);
603 if (head_blk >= num_scan_bblks) {
604 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
605
606 /* start ptr at last block ptr before head_blk */
607 if ((error = xlog_find_verify_log_record(log, start_blk,
608 &head_blk, 0)) == -1) {
609 error = XFS_ERROR(EIO);
610 goto bp_err;
611 } else if (error)
612 goto bp_err;
613 } else {
614 start_blk = 0;
615 ASSERT(head_blk <= INT_MAX);
616 if ((error = xlog_find_verify_log_record(log, start_blk,
617 &head_blk, 0)) == -1) {
618 /* We hit the beginning of the log during our search */
619 start_blk = log_bbnum - (num_scan_bblks - head_blk);
620 new_blk = log_bbnum;
621 ASSERT(start_blk <= INT_MAX &&
622 (xfs_daddr_t) log_bbnum-start_blk >= 0);
623 ASSERT(head_blk <= INT_MAX);
624 if ((error = xlog_find_verify_log_record(log,
625 start_blk, &new_blk,
626 (int)head_blk)) == -1) {
627 error = XFS_ERROR(EIO);
628 goto bp_err;
629 } else if (error)
630 goto bp_err;
631 if (new_blk != log_bbnum)
632 head_blk = new_blk;
633 } else if (error)
634 goto bp_err;
635 }
636
637 xlog_put_bp(bp);
638 if (head_blk == log_bbnum)
639 *return_head_blk = 0;
640 else
641 *return_head_blk = head_blk;
642 /*
643 * When returning here, we have a good block number. Bad block
644 * means that during a previous crash, we didn't have a clean break
645 * from cycle number N to cycle number N-1. In this case, we need
646 * to find the first block with cycle number N-1.
647 */
648 return 0;
649
650 bp_err:
651 xlog_put_bp(bp);
652
653 if (error)
654 xfs_warn(log->l_mp, "failed to find log head");
655 return error;
656 }
657
658 /*
659 * Find the sync block number or the tail of the log.
660 *
661 * This will be the block number of the last record to have its
662 * associated buffers synced to disk. Every log record header has
663 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
664 * to get a sync block number. The only concern is to figure out which
665 * log record header to believe.
666 *
667 * The following algorithm uses the log record header with the largest
668 * lsn. The entire log record does not need to be valid. We only care
669 * that the header is valid.
670 *
671 * We could speed up search by using current head_blk buffer, but it is not
672 * available.
673 */
674 int
675 xlog_find_tail(
676 struct xlog *log,
677 xfs_daddr_t *head_blk,
678 xfs_daddr_t *tail_blk)
679 {
680 xlog_rec_header_t *rhead;
681 xlog_op_header_t *op_head;
682 char *offset = NULL;
683 xfs_buf_t *bp;
684 int error, i, found;
685 xfs_daddr_t umount_data_blk;
686 xfs_daddr_t after_umount_blk;
687 xfs_lsn_t tail_lsn;
688 int hblks;
689
690 found = 0;
691
692 /*
693 * Find previous log record
694 */
695 if ((error = xlog_find_head(log, head_blk)))
696 return error;
697
698 bp = xlog_get_bp(log, 1);
699 if (!bp)
700 return ENOMEM;
701 if (*head_blk == 0) { /* special case */
702 error = xlog_bread(log, 0, 1, bp, &offset);
703 if (error)
704 goto done;
705
706 if (xlog_get_cycle(offset) == 0) {
707 *tail_blk = 0;
708 /* leave all other log inited values alone */
709 goto done;
710 }
711 }
712
713 /*
714 * Search backwards looking for log record header block
715 */
716 ASSERT(*head_blk < INT_MAX);
717 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
718 error = xlog_bread(log, i, 1, bp, &offset);
719 if (error)
720 goto done;
721
722 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
723 found = 1;
724 break;
725 }
726 }
727 /*
728 * If we haven't found the log record header block, start looking
729 * again from the end of the physical log. XXXmiken: There should be
730 * a check here to make sure we didn't search more than N blocks in
731 * the previous code.
732 */
733 if (!found) {
734 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
735 error = xlog_bread(log, i, 1, bp, &offset);
736 if (error)
737 goto done;
738
739 if (*(__be32 *)offset ==
740 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
741 found = 2;
742 break;
743 }
744 }
745 }
746 if (!found) {
747 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
748 xlog_put_bp(bp);
749 ASSERT(0);
750 return XFS_ERROR(EIO);
751 }
752
753 /* find blk_no of tail of log */
754 rhead = (xlog_rec_header_t *)offset;
755 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
756
757 /*
758 * Reset log values according to the state of the log when we
759 * crashed. In the case where head_blk == 0, we bump curr_cycle
760 * one because the next write starts a new cycle rather than
761 * continuing the cycle of the last good log record. At this
762 * point we have guaranteed that all partial log records have been
763 * accounted for. Therefore, we know that the last good log record
764 * written was complete and ended exactly on the end boundary
765 * of the physical log.
766 */
767 log->l_prev_block = i;
768 log->l_curr_block = (int)*head_blk;
769 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
770 if (found == 2)
771 log->l_curr_cycle++;
772 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
773 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
774 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
775 BBTOB(log->l_curr_block));
776 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
777 BBTOB(log->l_curr_block));
778
779 /*
780 * Look for unmount record. If we find it, then we know there
781 * was a clean unmount. Since 'i' could be the last block in
782 * the physical log, we convert to a log block before comparing
783 * to the head_blk.
784 *
785 * Save the current tail lsn to use to pass to
786 * xlog_clear_stale_blocks() below. We won't want to clear the
787 * unmount record if there is one, so we pass the lsn of the
788 * unmount record rather than the block after it.
789 */
790 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
791 int h_size = be32_to_cpu(rhead->h_size);
792 int h_version = be32_to_cpu(rhead->h_version);
793
794 if ((h_version & XLOG_VERSION_2) &&
795 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
796 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
797 if (h_size % XLOG_HEADER_CYCLE_SIZE)
798 hblks++;
799 } else {
800 hblks = 1;
801 }
802 } else {
803 hblks = 1;
804 }
805 after_umount_blk = (i + hblks + (int)
806 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
807 tail_lsn = atomic64_read(&log->l_tail_lsn);
808 if (*head_blk == after_umount_blk &&
809 be32_to_cpu(rhead->h_num_logops) == 1) {
810 umount_data_blk = (i + hblks) % log->l_logBBsize;
811 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
812 if (error)
813 goto done;
814
815 op_head = (xlog_op_header_t *)offset;
816 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
817 /*
818 * Set tail and last sync so that newly written
819 * log records will point recovery to after the
820 * current unmount record.
821 */
822 xlog_assign_atomic_lsn(&log->l_tail_lsn,
823 log->l_curr_cycle, after_umount_blk);
824 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
825 log->l_curr_cycle, after_umount_blk);
826 *tail_blk = after_umount_blk;
827
828 /*
829 * Note that the unmount was clean. If the unmount
830 * was not clean, we need to know this to rebuild the
831 * superblock counters from the perag headers if we
832 * have a filesystem using non-persistent counters.
833 */
834 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
835 }
836 }
837
838 /*
839 * Make sure that there are no blocks in front of the head
840 * with the same cycle number as the head. This can happen
841 * because we allow multiple outstanding log writes concurrently,
842 * and the later writes might make it out before earlier ones.
843 *
844 * We use the lsn from before modifying it so that we'll never
845 * overwrite the unmount record after a clean unmount.
846 *
847 * Do this only if we are going to recover the filesystem
848 *
849 * NOTE: This used to say "if (!readonly)"
850 * However on Linux, we can & do recover a read-only filesystem.
851 * We only skip recovery if NORECOVERY is specified on mount,
852 * in which case we would not be here.
853 *
854 * But... if the -device- itself is readonly, just skip this.
855 * We can't recover this device anyway, so it won't matter.
856 */
857 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
858 error = xlog_clear_stale_blocks(log, tail_lsn);
859
860 done:
861 xlog_put_bp(bp);
862
863 if (error)
864 xfs_warn(log->l_mp, "failed to locate log tail");
865 return error;
866 }
867
868 /*
869 * Is the log zeroed at all?
870 *
871 * The last binary search should be changed to perform an X block read
872 * once X becomes small enough. You can then search linearly through
873 * the X blocks. This will cut down on the number of reads we need to do.
874 *
875 * If the log is partially zeroed, this routine will pass back the blkno
876 * of the first block with cycle number 0. It won't have a complete LR
877 * preceding it.
878 *
879 * Return:
880 * 0 => the log is completely written to
881 * -1 => use *blk_no as the first block of the log
882 * >0 => error has occurred
883 */
884 int
885 xlog_find_zeroed(
886 struct xlog *log,
887 xfs_daddr_t *blk_no)
888 {
889 xfs_buf_t *bp;
890 char *offset;
891 uint first_cycle, last_cycle;
892 xfs_daddr_t new_blk, last_blk, start_blk;
893 xfs_daddr_t num_scan_bblks;
894 int error, log_bbnum = log->l_logBBsize;
895
896 *blk_no = 0;
897
898 /* check totally zeroed log */
899 bp = xlog_get_bp(log, 1);
900 if (!bp)
901 return ENOMEM;
902 error = xlog_bread(log, 0, 1, bp, &offset);
903 if (error)
904 goto bp_err;
905
906 first_cycle = xlog_get_cycle(offset);
907 if (first_cycle == 0) { /* completely zeroed log */
908 *blk_no = 0;
909 xlog_put_bp(bp);
910 return -1;
911 }
912
913 /* check partially zeroed log */
914 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
915 if (error)
916 goto bp_err;
917
918 last_cycle = xlog_get_cycle(offset);
919 if (last_cycle != 0) { /* log completely written to */
920 xlog_put_bp(bp);
921 return 0;
922 } else if (first_cycle != 1) {
923 /*
924 * If the cycle of the last block is zero, the cycle of
925 * the first block must be 1. If it's not, maybe we're
926 * not looking at a log... Bail out.
927 */
928 xfs_warn(log->l_mp,
929 "Log inconsistent or not a log (last==0, first!=1)");
930 error = XFS_ERROR(EINVAL);
931 goto bp_err;
932 }
933
934 /* we have a partially zeroed log */
935 last_blk = log_bbnum-1;
936 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
937 goto bp_err;
938
939 /*
940 * Validate the answer. Because there is no way to guarantee that
941 * the entire log is made up of log records which are the same size,
942 * we scan over the defined maximum blocks. At this point, the maximum
943 * is not chosen to mean anything special. XXXmiken
944 */
945 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
946 ASSERT(num_scan_bblks <= INT_MAX);
947
948 if (last_blk < num_scan_bblks)
949 num_scan_bblks = last_blk;
950 start_blk = last_blk - num_scan_bblks;
951
952 /*
953 * We search for any instances of cycle number 0 that occur before
954 * our current estimate of the head. What we're trying to detect is
955 * 1 ... | 0 | 1 | 0...
956 * ^ binary search ends here
957 */
958 if ((error = xlog_find_verify_cycle(log, start_blk,
959 (int)num_scan_bblks, 0, &new_blk)))
960 goto bp_err;
961 if (new_blk != -1)
962 last_blk = new_blk;
963
964 /*
965 * Potentially backup over partial log record write. We don't need
966 * to search the end of the log because we know it is zero.
967 */
968 if ((error = xlog_find_verify_log_record(log, start_blk,
969 &last_blk, 0)) == -1) {
970 error = XFS_ERROR(EIO);
971 goto bp_err;
972 } else if (error)
973 goto bp_err;
974
975 *blk_no = last_blk;
976 bp_err:
977 xlog_put_bp(bp);
978 if (error)
979 return error;
980 return -1;
981 }
982
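/*
 * Look up an in-progress transaction in the recovery hash chain by its
 * log transaction id; returns NULL if no transaction with that tid has
 * been seen yet.
 */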
983 STATIC xlog_recover_t *
984 xlog_recover_find_tid(
985 struct hlist_head *head,
986 xlog_tid_t tid)
987 {
988 xlog_recover_t *trans;
989 struct hlist_node *n;
990
991 hlist_for_each_entry(trans, n, head, r_list) {
992 if (trans->r_log_tid == tid)
993 return trans;
994 }
995 return NULL;
996 }
997
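/*
 * Start tracking a new transaction: allocate a recovery structure for
 * the given tid and lsn and add it to the head of the hash chain.
 */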
998 STATIC void
999 xlog_recover_new_tid(
1000 struct hlist_head *head,
1001 xlog_tid_t tid,
1002 xfs_lsn_t lsn)
1003 {
1004 xlog_recover_t *trans;
1005
1006 trans = kmem_zalloc(sizeof(xlog_recover_t), 0);
1007 trans->r_log_tid = tid;
1008 trans->r_lsn = lsn;
1009 INIT_LIST_HEAD(&trans->r_itemq);
1010
1011 INIT_HLIST_NODE(&trans->r_list);
1012 hlist_add_head(&trans->r_list, head);
1013 }
1014
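/* Allocate a new, empty recovery item and append it to the item queue. */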
1015 STATIC void
1016 xlog_recover_add_item(
1017 struct list_head *head)
1018 {
1019 xlog_recover_item_t *item;
1020
1021 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
1022 INIT_LIST_HEAD(&item->ri_list);
1023 list_add_tail(&item->ri_list, head);
1024 }
1025
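/*
 * Append continuation data to the transaction. If no items have been
 * added yet, the data completes the in-core transaction header;
 * otherwise it is appended to the last region of the last item by
 * reallocating that region's buffer.
 */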
1026 STATIC int
1027 xlog_recover_add_to_cont_trans(
1028 struct xlog *log,
1029 struct xlog_recover *trans,
1030 char *dp,
1031 int len)
1032 {
1033 xlog_recover_item_t *item;
1034 char *ptr, *old_ptr;
1035 int old_len;
1036
1037 if (list_empty(&trans->r_itemq)) {
1038 /* finish copying rest of trans header */
1039 xlog_recover_add_item(&trans->r_itemq);
1040 ptr = (char *) &trans->r_theader +
1041 sizeof(xfs_trans_header_t) - len;
1042 memcpy(ptr, dp, len); /* d, s, l */
1043 return 0;
1044 }
1045 /* take the tail entry */
1046 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1047
1048 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1049 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1050
1051 ptr = kmem_realloc(old_ptr, len+old_len, 0);
1052 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1053 item->ri_buf[item->ri_cnt-1].i_len += len;
1054 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1055 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1056 return 0;
1057 }
1058
1059 /*
1060 * The next region to add is the start of a new region. It could be
1061 * a whole region or it could be the first part of a new region. Because
1062 * of this, the assumption here is that the type and size fields of all
1063 * format structures fit into the first 32 bits of the structure.
1064 *
1065 * This works because all regions must be 32 bit aligned. Therefore, we
1066 * either have both fields or we have neither field. In the case we have
1067 * neither field, the data part of the region is zero length. We only have
1068 * a log_op_header and can throw away the header since a new one will appear
1069 * later. If we have at least 4 bytes, then we can determine how many regions
1070 * will appear in the current log item.
1071 */
1072 STATIC int
1073 xlog_recover_add_to_trans(
1074 struct xlog *log,
1075 struct xlog_recover *trans,
1076 char *dp,
1077 int len)
1078 {
1079 struct xfs_inode_log_format *in_f; /* any will do */
1080 xlog_recover_item_t *item;
1081 char *ptr;
1082
1083 if (!len)
1084 return 0;
1085 if (list_empty(&trans->r_itemq)) {
1086 /* we need to catch log corruptions here */
1087 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1088 xfs_warn(log->l_mp, "%s: bad header magic number",
1089 __func__);
1090 ASSERT(0);
1091 return XFS_ERROR(EIO);
1092 }
1093 if (len == sizeof(xfs_trans_header_t))
1094 xlog_recover_add_item(&trans->r_itemq);
1095 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1096 return 0;
1097 }
1098
1099 ptr = kmem_alloc(len, 0);
1100 memcpy(ptr, dp, len);
1101 in_f = (struct xfs_inode_log_format *)ptr;
1102
1103 /* take the tail entry */
1104 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1105 if (item->ri_total != 0 &&
1106 item->ri_total == item->ri_cnt) {
1107 /* tail item is in use, get a new one */
1108 xlog_recover_add_item(&trans->r_itemq);
1109 item = list_entry(trans->r_itemq.prev,
1110 xlog_recover_item_t, ri_list);
1111 }
1112
1113 if (item->ri_total == 0) { /* first region to be added */
1114 if (in_f->ilf_size == 0 ||
1115 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1116 xfs_warn(log->l_mp,
1117 "bad number of regions (%d) in inode log format",
1118 in_f->ilf_size);
1119 ASSERT(0);
1120 kmem_free(ptr);
1121 return XFS_ERROR(EIO);
1122 }
1123
1124 item->ri_total = in_f->ilf_size;
1125 item->ri_buf =
1126 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1127 0);
1128 }
1129 ASSERT(item->ri_total > item->ri_cnt);
1130 /* Description region is ri_buf[0] */
1131 item->ri_buf[item->ri_cnt].i_addr = ptr;
1132 item->ri_buf[item->ri_cnt].i_len = len;
1133 item->ri_cnt++;
1134 trace_xfs_log_recover_item_add(log, trans, item, 0);
1135 return 0;
1136 }
1137
1138 /*
1139 * Free up any resources allocated by the transaction
1140 *
1141 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
1142 */
1143 STATIC void
1144 xlog_recover_free_trans(
1145 struct xlog_recover *trans)
1146 {
1147 xlog_recover_item_t *item, *n;
1148 int i;
1149
1150 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
1151 /* Free the regions in the item. */
1152 list_del(&item->ri_list);
1153 for (i = 0; i < item->ri_cnt; i++)
1154 kmem_free(item->ri_buf[i].i_addr);
1155 /* Free the item itself */
1156 kmem_free(item->ri_buf);
1157 kmem_free(item);
1158 }
1159 /* Free the transaction recover structure */
1160 kmem_free(trans);
1161 }
1162
1163 /*
1164 * Perform the transaction.
1165 *
1166 * If the transaction modifies a buffer or inode, do it now. Otherwise,
1167 * EFIs and EFDs get queued up by adding entries into the AIL for them.
1168 */
1169 STATIC int
1170 xlog_recover_commit_trans(
1171 struct xlog *log,
1172 struct xlog_recover *trans,
1173 int pass)
1174 {
1175 int error = 0;
1176
1177 hlist_del(&trans->r_list);
1178 if ((error = xlog_recover_do_trans(log, trans, pass)))
1179 return error;
1180
1181 xlog_recover_free_trans(trans);
1182 return 0;
1183 }
1184
1185 STATIC int
1186 xlog_recover_unmount_trans(
1187 	struct xlog *log, xlog_recover_t *trans)
1188 {
1189 /* Do nothing now */
1190 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
1191 return 0;
1192 }
1193
1194 /*
1195 * There are two valid states of the r_state field. 0 indicates that the
1196 * transaction structure is in a normal state. We have either seen the
1197 * start of the transaction or the last operation we added was not a partial
1198 * operation. If the last operation we added to the transaction was a
1199 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
1200 *
1201 * NOTE: skip LRs with 0 data length.
1202 */
1203 STATIC int
1204 xlog_recover_process_data(
1205 struct xlog *log,
1206 struct hlist_head rhash[],
1207 struct xlog_rec_header *rhead,
1208 char *dp,
1209 int pass)
1210 {
1211 char *lp;
1212 int num_logops;
1213 xlog_op_header_t *ohead;
1214 xlog_recover_t *trans;
1215 xlog_tid_t tid;
1216 int error;
1217 unsigned long hash;
1218 uint flags;
1219
1220 lp = dp + be32_to_cpu(rhead->h_len);
1221 num_logops = be32_to_cpu(rhead->h_num_logops);
1222
1223 /* check the log format matches our own - else we can't recover */
1224 if (xlog_header_check_recover(log->l_mp, rhead))
1225 return (XFS_ERROR(EIO));
1226
1227 while ((dp < lp) && num_logops) {
1228 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
1229 ohead = (xlog_op_header_t *)dp;
1230 dp += sizeof(xlog_op_header_t);
1231 if (ohead->oh_clientid != XFS_TRANSACTION &&
1232 ohead->oh_clientid != XFS_LOG) {
1233 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
1234 __func__, ohead->oh_clientid);
1235 ASSERT(0);
1236 return (XFS_ERROR(EIO));
1237 }
1238 tid = be32_to_cpu(ohead->oh_tid);
1239 hash = XLOG_RHASH(tid);
1240 trans = xlog_recover_find_tid(&rhash[hash], tid);
1241 if (trans == NULL) { /* not found; add new tid */
1242 if (ohead->oh_flags & XLOG_START_TRANS)
1243 xlog_recover_new_tid(&rhash[hash], tid,
1244 be64_to_cpu(rhead->h_lsn));
1245 } else {
1246 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
1247 xfs_warn(log->l_mp, "%s: bad length 0x%x",
1248 __func__, be32_to_cpu(ohead->oh_len));
1249 return (XFS_ERROR(EIO));
1250 }
1251 flags = ohead->oh_flags & ~XLOG_END_TRANS;
1252 if (flags & XLOG_WAS_CONT_TRANS)
1253 flags &= ~XLOG_CONTINUE_TRANS;
1254 switch (flags) {
1255 case XLOG_COMMIT_TRANS:
1256 error = xlog_recover_commit_trans(log,
1257 trans, pass);
1258 break;
1259 case XLOG_UNMOUNT_TRANS:
1260 error = xlog_recover_unmount_trans(log, trans);
1261 break;
1262 case XLOG_WAS_CONT_TRANS:
1263 error = xlog_recover_add_to_cont_trans(log,
1264 trans, dp,
1265 be32_to_cpu(ohead->oh_len));
1266 break;
1267 case XLOG_START_TRANS:
1268 xfs_warn(log->l_mp, "%s: bad transaction",
1269 __func__);
1270 ASSERT(0);
1271 error = XFS_ERROR(EIO);
1272 break;
1273 case 0:
1274 case XLOG_CONTINUE_TRANS:
1275 error = xlog_recover_add_to_trans(log, trans,
1276 dp, be32_to_cpu(ohead->oh_len));
1277 break;
1278 default:
1279 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
1280 __func__, flags);
1281 ASSERT(0);
1282 error = XFS_ERROR(EIO);
1283 break;
1284 }
1285 if (error)
1286 return error;
1287 }
1288 dp += be32_to_cpu(ohead->oh_len);
1289 num_logops--;
1290 }
1291 return 0;
1292 }
1293
1294 /*
1295 * Unpack the log buffer data and CRC check it. If the check fails, issue a
1296 * warning if and only if the CRC in the header is non-zero. This makes the
1297 * check an advisory warning, and the zero CRC check will prevent failure
1298 * warnings from being emitted when upgrading the kernel from one that does not
1299 * add CRCs by default.
1300 *
1301 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
1302 * corruption failure.
1303 *
1304 * XXX: we do not calculate the CRC here yet. It's not clear what we should do
1305 * with CRC errors here in userspace, so we'll address that problem later on.
1306 */
1307 #define xlog_cksum(l,r,dp,len) ((r)->h_crc)
1308 STATIC int
1309 xlog_unpack_data_crc(
1310 struct xlog_rec_header *rhead,
1311 char *dp,
1312 struct xlog *log)
1313 {
1314 __le32 crc;
1315
1316 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
1317 if (crc != rhead->h_crc) {
1318 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
1319 xfs_alert(log->l_mp,
1320 "log record CRC mismatch: found 0x%x, expected 0x%x.",
1321 le32_to_cpu(rhead->h_crc),
1322 le32_to_cpu(crc));
1323 xfs_hex_dump(dp, 32);
1324 }
1325
1326 /*
1327 * If we've detected a log record corruption, then we can't
1328 * recover past this point. Abort recovery if we are enforcing
1329 * CRC protection by punting an error back up the stack.
1330 */
1331 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
1332 return EFSCORRUPTED;
1333 }
1334
1335 return 0;
1336 }
1337
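/*
 * Restore the log record data to its original on-disk layout: when the
 * record was written, the first word of each basic block was replaced
 * by the record's cycle number and the original word saved away, so
 * copy the saved words back in from h_cycle_data (and, for v2 logs,
 * from the extended headers).
 */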
1338 STATIC int
1339 xlog_unpack_data(
1340 struct xlog_rec_header *rhead,
1341 char *dp,
1342 struct xlog *log)
1343 {
1344 int i, j, k;
1345 int error;
1346
1347 error = xlog_unpack_data_crc(rhead, dp, log);
1348 if (error)
1349 return error;
1350
1351 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
1352 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
1353 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
1354 dp += BBSIZE;
1355 }
1356
1357 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1358 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
1359 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
1360 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1361 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1362 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
1363 dp += BBSIZE;
1364 }
1365 }
1366
1367 return 0;
1368 }
1369
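/*
 * Sanity check a log record header: magic number, log version, record
 * length, and the block number it was read from.
 */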
1370 STATIC int
1371 xlog_valid_rec_header(
1372 struct xlog *log,
1373 struct xlog_rec_header *rhead,
1374 xfs_daddr_t blkno)
1375 {
1376 int hlen;
1377
1378 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
1379 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
1380 XFS_ERRLEVEL_LOW, log->l_mp);
1381 return XFS_ERROR(EFSCORRUPTED);
1382 }
1383 if (unlikely(
1384 (!rhead->h_version ||
1385 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
1386 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
1387 __func__, be32_to_cpu(rhead->h_version));
1388 return XFS_ERROR(EIO);
1389 }
1390
1391 /* LR body must have data or it wouldn't have been written */
1392 hlen = be32_to_cpu(rhead->h_len);
1393 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
1394 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
1395 XFS_ERRLEVEL_LOW, log->l_mp);
1396 return XFS_ERROR(EFSCORRUPTED);
1397 }
1398 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
1399 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
1400 XFS_ERRLEVEL_LOW, log->l_mp);
1401 return XFS_ERROR(EFSCORRUPTED);
1402 }
1403 return 0;
1404 }
1405
1406 /*
1407 * Read the log from tail to head and process the log records found.
1408 * Handle the two cases where the tail and head are in the same cycle
1409 * and where the active portion of the log wraps around the end of
1410 * the physical log separately. The pass parameter is passed through
1411 * to the routines called to process the data and is not looked at
1412 * here.
1413 */
1414 int
1415 xlog_do_recovery_pass(
1416 struct xlog *log,
1417 xfs_daddr_t head_blk,
1418 xfs_daddr_t tail_blk,
1419 int pass)
1420 {
1421 xlog_rec_header_t *rhead;
1422 xfs_daddr_t blk_no;
1423 char *offset;
1424 xfs_buf_t *hbp, *dbp;
1425 int error = 0, h_size;
1426 int bblks, split_bblks;
1427 int hblks, split_hblks, wrapped_hblks;
1428 struct hlist_head rhash[XLOG_RHASH_SIZE];
1429
1430 ASSERT(head_blk != tail_blk);
1431
1432 /*
1433 * Read the header of the tail block and get the iclog buffer size from
1434 * h_size. Use this to tell how many sectors make up the log header.
1435 */
1436 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1437 /*
1438 * When using variable length iclogs, read first sector of
1439 * iclog header and extract the header size from it. Get a
1440 * new hbp that is the correct size.
1441 */
1442 hbp = xlog_get_bp(log, 1);
1443 if (!hbp)
1444 return ENOMEM;
1445
1446 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
1447 if (error)
1448 goto bread_err1;
1449
1450 rhead = (xlog_rec_header_t *)offset;
1451 error = xlog_valid_rec_header(log, rhead, tail_blk);
1452 if (error)
1453 goto bread_err1;
1454 h_size = be32_to_cpu(rhead->h_size);
1455 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
1456 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1457 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1458 if (h_size % XLOG_HEADER_CYCLE_SIZE)
1459 hblks++;
1460 xlog_put_bp(hbp);
1461 hbp = xlog_get_bp(log, hblks);
1462 } else {
1463 hblks = 1;
1464 }
1465 } else {
1466 ASSERT(log->l_sectBBsize == 1);
1467 hblks = 1;
1468 hbp = xlog_get_bp(log, 1);
1469 h_size = XLOG_BIG_RECORD_BSIZE;
1470 }
1471
1472 if (!hbp)
1473 return ENOMEM;
1474 dbp = xlog_get_bp(log, BTOBB(h_size));
1475 if (!dbp) {
1476 xlog_put_bp(hbp);
1477 return ENOMEM;
1478 }
1479
1480 memset(rhash, 0, sizeof(rhash));
1481 if (tail_blk <= head_blk) {
1482 for (blk_no = tail_blk; blk_no < head_blk; ) {
1483 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1484 if (error)
1485 goto bread_err2;
1486
1487 rhead = (xlog_rec_header_t *)offset;
1488 error = xlog_valid_rec_header(log, rhead, blk_no);
1489 if (error)
1490 goto bread_err2;
1491
1492 /* blocks in data section */
1493 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1494 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
1495 &offset);
1496 if (error)
1497 goto bread_err2;
1498
1499 error = xlog_unpack_data(rhead, offset, log);
1500 if (error)
1501 goto bread_err2;
1502
1503 error = xlog_recover_process_data(log,
1504 rhash, rhead, offset, pass);
1505 if (error)
1506 goto bread_err2;
1507 blk_no += bblks + hblks;
1508 }
1509 } else {
1510 /*
1511 * Perform recovery around the end of the physical log.
1512 * When the head is not on the same cycle number as the tail,
1513 * we can't do a sequential recovery as above.
1514 */
1515 blk_no = tail_blk;
1516 while (blk_no < log->l_logBBsize) {
1517 /*
1518 * Check for header wrapping around physical end-of-log
1519 */
1520 offset = hbp->b_addr;
1521 split_hblks = 0;
1522 wrapped_hblks = 0;
1523 if (blk_no + hblks <= log->l_logBBsize) {
1524 /* Read header in one read */
1525 error = xlog_bread(log, blk_no, hblks, hbp,
1526 &offset);
1527 if (error)
1528 goto bread_err2;
1529 } else {
1530 /* This LR is split across physical log end */
1531 if (blk_no != log->l_logBBsize) {
1532 /* some data before physical log end */
1533 ASSERT(blk_no <= INT_MAX);
1534 split_hblks = log->l_logBBsize - (int)blk_no;
1535 ASSERT(split_hblks > 0);
1536 error = xlog_bread(log, blk_no,
1537 split_hblks, hbp,
1538 &offset);
1539 if (error)
1540 goto bread_err2;
1541 }
1542
1543 /*
1544 * Note: this black magic still works with
1545 * large sector sizes (non-512) only because:
1546 * - we increased the buffer size originally
1547 * by 1 sector giving us enough extra space
1548 * for the second read;
1549 * - the log start is guaranteed to be sector
1550 * aligned;
1551 * - we read the log end (LR header start)
1552 * _first_, then the log start (LR header end)
1553 * - order is important.
1554 */
1555 wrapped_hblks = hblks - split_hblks;
1556 error = xlog_bread_offset(log, 0,
1557 wrapped_hblks, hbp,
1558 offset + BBTOB(split_hblks));
1559 if (error)
1560 goto bread_err2;
1561 }
1562 rhead = (xlog_rec_header_t *)offset;
1563 error = xlog_valid_rec_header(log, rhead,
1564 split_hblks ? blk_no : 0);
1565 if (error)
1566 goto bread_err2;
1567
1568 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1569 blk_no += hblks;
1570
1571 /* Read in data for log record */
1572 if (blk_no + bblks <= log->l_logBBsize) {
1573 error = xlog_bread(log, blk_no, bblks, dbp,
1574 &offset);
1575 if (error)
1576 goto bread_err2;
1577 } else {
1578 /* This log record is split across the
1579 * physical end of log */
1580 offset = dbp->b_addr;
1581 split_bblks = 0;
1582 if (blk_no != log->l_logBBsize) {
1583 /* some data is before the physical
1584 * end of log */
1585 ASSERT(!wrapped_hblks);
1586 ASSERT(blk_no <= INT_MAX);
1587 split_bblks =
1588 log->l_logBBsize - (int)blk_no;
1589 ASSERT(split_bblks > 0);
1590 error = xlog_bread(log, blk_no,
1591 split_bblks, dbp,
1592 &offset);
1593 if (error)
1594 goto bread_err2;
1595 }
1596
1597 /*
1598 * Note: this black magic still works with
1599 * large sector sizes (non-512) only because:
1600 * - we increased the buffer size originally
1601 * by 1 sector giving us enough extra space
1602 * for the second read;
1603 * - the log start is guaranteed to be sector
1604 * aligned;
1605 * - we read the log end (LR header start)
1606 * _first_, then the log start (LR header end)
1607 * - order is important.
1608 */
1609 error = xlog_bread_offset(log, 0,
1610 bblks - split_bblks, dbp,
1611 offset + BBTOB(split_bblks));
1612 if (error)
1613 goto bread_err2;
1614 }
1615
1616 error = xlog_unpack_data(rhead, offset, log);
1617 if (error)
1618 goto bread_err2;
1619
1620 error = xlog_recover_process_data(log, rhash,
1621 rhead, offset, pass);
1622 if (error)
1623 goto bread_err2;
1624 blk_no += bblks;
1625 }
1626
1627 ASSERT(blk_no >= log->l_logBBsize);
1628 blk_no -= log->l_logBBsize;
1629
1630 /* read first part of physical log */
1631 while (blk_no < head_blk) {
1632 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1633 if (error)
1634 goto bread_err2;
1635
1636 rhead = (xlog_rec_header_t *)offset;
1637 error = xlog_valid_rec_header(log, rhead, blk_no);
1638 if (error)
1639 goto bread_err2;
1640
1641 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1642 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
1643 &offset);
1644 if (error)
1645 goto bread_err2;
1646
1647 error = xlog_unpack_data(rhead, offset, log);
1648 if (error)
1649 goto bread_err2;
1650
1651 error = xlog_recover_process_data(log, rhash,
1652 rhead, offset, pass);
1653 if (error)
1654 goto bread_err2;
1655 blk_no += bblks + hblks;
1656 }
1657 }
1658
1659 bread_err2:
1660 xlog_put_bp(dbp);
1661 bread_err1:
1662 xlog_put_bp(hbp);
1663 return error;
1664 }