1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 #include <xfs/libxlog.h>
20
21 #define xfs_readonly_buftarg(buftarg) (0)
22
23 /* avoid set-but-unused var warning. gcc is not very bright. */
24 #define xlog_clear_stale_blocks(log, taillsn) ({ \
25 (taillsn) = (taillsn); \
26 (0); \
27 })
28
29 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
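/*
 * Integer midpoint of two block numbers, used by the binary search in
 * xlog_find_cycle_start() below.
 */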
30
31 /*
32 * Verify the given count of basic blocks is a valid number of blocks
33 * to specify for an operation involving the given XFS log buffer.
34 * Returns nonzero if the count is valid, 0 otherwise.
35 */
36
37 static inline int
38 xlog_buf_bbcount_valid(
39 struct xlog *log,
40 int bbcount)
41 {
42 return bbcount > 0 && bbcount <= log->l_logBBsize;
43 }
44
45 /*
46 * Allocate a buffer to hold log data. The buffer needs to be able
47 * to map to a range of nbblks basic blocks at any valid (basic
48 * block) offset within the log.
49 */
50 xfs_buf_t *
51 xlog_get_bp(
52 struct xlog *log,
53 int nbblks)
54 {
55 if (!xlog_buf_bbcount_valid(log, nbblks)) {
56 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
57 nbblks);
58 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
59 return NULL;
60 }
61
62 /*
63 * We do log I/O in units of log sectors (a power-of-2
64 * multiple of the basic block size), so we round up the
65 * requested size to accommodate the basic blocks required
66 * for complete log sectors.
67 *
68 * In addition, the buffer may be used for a non-sector-
69 * aligned block offset, in which case an I/O of the
70 * requested size could extend beyond the end of the
71 * buffer. If the requested size is only 1 basic block it
72 * will never straddle a sector boundary, so this won't be
73 * an issue. Nor will this be a problem if the log I/O is
74 * done in basic blocks (sector size 1). But otherwise we
75 * extend the buffer by one extra log sector to ensure
76 * there's space to accommodate this possibility.
77 */
78 if (nbblks > 1 && log->l_sectBBsize > 1)
79 nbblks += log->l_sectBBsize;
80 nbblks = round_up(nbblks, log->l_sectBBsize);
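/*
 * For example, with 4k log sectors (l_sectBBsize == 8) a request for
 * 10 basic blocks grows to 10 + 8 = 18 and is then rounded up to 24,
 * so a sector-unaligned 10 block read always fits in the buffer.
 */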
81
82 return libxfs_getbufr(log->l_dev, (xfs_daddr_t)-1, nbblks);
83 }
84
85 void
86 xlog_put_bp(
87 xfs_buf_t *bp)
88 {
89 libxfs_putbufr(bp);
90 }
91
92 /*
93 * Return the address of the start of the given block number's data
94 * in a log buffer. The buffer covers a log sector-aligned region.
95 */
96 STATIC xfs_caddr_t
97 xlog_align(
98 struct xlog *log,
99 xfs_daddr_t blk_no,
100 int nbblks,
101 struct xfs_buf *bp)
102 {
103 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
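/*
 * xlog_bread_noalign() rounds its reads down to the log sector boundary
 * at or below blk_no, so the requested block starts this many basic
 * blocks into the buffer (l_sectBBsize is a power of two, hence the mask).
 */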
104
105 ASSERT(offset + nbblks <= bp->b_length);
106 return bp->b_addr + BBTOB(offset);
107 }
108
109
110 /*
111 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
112 */
113 int
114 xlog_bread_noalign(
115 struct xlog *log,
116 xfs_daddr_t blk_no,
117 int nbblks,
118 struct xfs_buf *bp)
119 {
120 if (!xlog_buf_bbcount_valid(log, nbblks)) {
121 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
122 nbblks);
123 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
124 return EFSCORRUPTED;
125 }
126
127 blk_no = round_down(blk_no, log->l_sectBBsize);
128 nbblks = round_up(nbblks, log->l_sectBBsize);
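/*
 * Continuing the 4k sector example (l_sectBBsize == 8): a 1 block read
 * at blk_no 13 becomes an 8 block read starting at block 8, and
 * xlog_align() later returns the address 5 basic blocks into the buffer.
 */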
129
130 ASSERT(nbblks > 0);
131 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
132
133 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
134 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
135 bp->b_error = 0;
136
137 return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
138 }
139
140 int
141 xlog_bread(
142 struct xlog *log,
143 xfs_daddr_t blk_no,
144 int nbblks,
145 struct xfs_buf *bp,
146 xfs_caddr_t *offset)
147 {
148 int error;
149
150 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
151 if (error)
152 return error;
153
154 *offset = xlog_align(log, blk_no, nbblks, bp);
155 return 0;
156 }
157
158 /*
159 * Read at an offset into the buffer. Returns with the buffer in its original
160 * state regardless of the result of the read.
161 */
162 STATIC int
163 xlog_bread_offset(
164 struct xlog *log,
165 xfs_daddr_t blk_no, /* block to read from */
166 int nbblks, /* blocks to read */
167 struct xfs_buf *bp,
168 xfs_caddr_t offset)
169 {
170 xfs_caddr_t orig_offset = bp->b_addr;
171 int orig_len = bp->b_bcount;
172 int error, error2;
173
174 error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
175 if (error)
176 return error;
177
178 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
179
180 /* must reset buffer pointer even on error */
181 error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
182 if (error)
183 return error;
184 return error2;
185 }
186
187 /*
188 * This routine finds (to an approximation) the first block in the physical
189 * log which contains the given cycle. It uses a binary search algorithm.
190 * Note that the algorithm can not be perfect because the disk will not
191 * necessarily be perfect.
192 */
193 int
194 xlog_find_cycle_start(
195 struct xlog *log,
196 struct xfs_buf *bp,
197 xfs_daddr_t first_blk,
198 xfs_daddr_t *last_blk,
199 uint cycle)
200 {
201 xfs_caddr_t offset;
202 xfs_daddr_t mid_blk;
203 xfs_daddr_t end_blk;
204 uint mid_cycle;
205 int error;
206
207 end_blk = *last_blk;
208 mid_blk = BLK_AVG(first_blk, end_blk);
209 while (mid_blk != first_blk && mid_blk != end_blk) {
210 error = xlog_bread(log, mid_blk, 1, bp, &offset);
211 if (error)
212 return error;
213 mid_cycle = xlog_get_cycle(offset);
214 if (mid_cycle == cycle)
215 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
216 else
217 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
218 mid_blk = BLK_AVG(first_blk, end_blk);
219 }
220 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
221 (mid_blk == end_blk && mid_blk-1 == first_blk));
222
223 *last_blk = end_blk;
224
225 return 0;
226 }
227
228 /*
229 * Check that a range of blocks does not contain stop_on_cycle_no.
230 * Fill in *new_blk with the block offset where such a block is
231 * found, or with -1 (an invalid block number) if there is no such
232 * block in the range. The scan needs to occur from front to back
233 * and the pointer into the region must be updated since a later
234 * routine will need to perform another test.
235 */
236 STATIC int
237 xlog_find_verify_cycle(
238 struct xlog *log,
239 xfs_daddr_t start_blk,
240 int nbblks,
241 uint stop_on_cycle_no,
242 xfs_daddr_t *new_blk)
243 {
244 xfs_daddr_t i, j;
245 uint cycle;
246 xfs_buf_t *bp;
247 xfs_daddr_t bufblks;
248 xfs_caddr_t buf = NULL;
249 int error = 0;
250
251 /*
252 * Greedily allocate a buffer big enough to handle the full
253 * range of basic blocks we'll be examining. If that fails,
254 * try a smaller size. We need to be able to read at least
255 * a log sector, or we're out of luck.
256 */
257 bufblks = 1 << ffs(nbblks);
258 while (bufblks > log->l_logBBsize)
259 bufblks >>= 1;
260 while (!(bp = xlog_get_bp(log, bufblks))) {
261 bufblks >>= 1;
262 if (bufblks < log->l_sectBBsize)
263 return ENOMEM;
264 }
265
266 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
267 int bcount;
268
269 bcount = min(bufblks, (start_blk + nbblks - i));
270
271 error = xlog_bread(log, i, bcount, bp, &buf);
272 if (error)
273 goto out;
274
275 for (j = 0; j < bcount; j++) {
276 cycle = xlog_get_cycle(buf);
277 if (cycle == stop_on_cycle_no) {
278 *new_blk = i+j;
279 goto out;
280 }
281
282 buf += BBSIZE;
283 }
284 }
285
286 *new_blk = -1;
287
288 out:
289 xlog_put_bp(bp);
290 return error;
291 }
292
293 /*
294 * Potentially back up over a partial log record write.
295 *
296 * In the typical case, last_blk is the number of the block directly after
297 * a good log record. Therefore, we subtract one to get the block number
298 * of the last block in the given buffer. extra_bblks contains the number
299 * of blocks we would have read on a previous read. This happens when the
300 * last log record is split over the end of the physical log.
301 *
302 * extra_bblks is the number of blocks potentially verified on a previous
303 * call to this routine.
304 */
305 STATIC int
306 xlog_find_verify_log_record(
307 struct xlog *log,
308 xfs_daddr_t start_blk,
309 xfs_daddr_t *last_blk,
310 int extra_bblks)
311 {
312 xfs_daddr_t i;
313 xfs_buf_t *bp;
314 xfs_caddr_t offset = NULL;
315 xlog_rec_header_t *head = NULL;
316 int error = 0;
317 int smallmem = 0;
318 int num_blks = *last_blk - start_blk;
319 int xhdrs;
320
321 ASSERT(start_blk != 0 || *last_blk != start_blk);
322
323 if (!(bp = xlog_get_bp(log, num_blks))) {
324 if (!(bp = xlog_get_bp(log, 1)))
325 return ENOMEM;
326 smallmem = 1;
327 } else {
328 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
329 if (error)
330 goto out;
331 offset += ((num_blks - 1) << BBSHIFT);
332 }
333
334 for (i = (*last_blk) - 1; i >= 0; i--) {
335 if (i < start_blk) {
336 /* valid log record not found */
337 xfs_warn(log->l_mp,
338 "Log inconsistent (didn't find previous header)");
339 ASSERT(0);
340 error = XFS_ERROR(EIO);
341 goto out;
342 }
343
344 if (smallmem) {
345 error = xlog_bread(log, i, 1, bp, &offset);
346 if (error)
347 goto out;
348 }
349
350 head = (xlog_rec_header_t *)offset;
351
352 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
353 break;
354
355 if (!smallmem)
356 offset -= BBSIZE;
357 }
358
359 /*
360 * We hit the beginning of the physical log & still no header. Return
361 * to caller. If caller can handle a return of -1, then this routine
362 * will be called again for the end of the physical log.
363 */
364 if (i == -1) {
365 error = -1;
366 goto out;
367 }
368
369 /*
370 * We have the final block of the good log (the first block
371 * of the log record _before_ the head). So we check the uuid.
372 */
373 if ((error = xlog_header_check_mount(log->l_mp, head)))
374 goto out;
375
376 /*
377 * We may have found a log record header before we expected one.
378 * last_blk will be the 1st block # with a given cycle #. We may end
379 * up reading an entire log record. In this case, we don't want to
380 * reset last_blk. Only when last_blk points in the middle of a log
381 * record do we update last_blk.
382 */
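/*
 * For example, a v2 log using 256k iclog buffers has h_size == 256k;
 * with XLOG_HEADER_CYCLE_SIZE == 32k the record header occupies
 * 256k / 32k = 8 basic blocks, so xhdrs works out to 8.
 */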
383 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
384 uint h_size = be32_to_cpu(head->h_size);
385
386 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
387 if (h_size % XLOG_HEADER_CYCLE_SIZE)
388 xhdrs++;
389 } else {
390 xhdrs = 1;
391 }
392
393 if (*last_blk - i + extra_bblks !=
394 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
395 *last_blk = i;
396
397 out:
398 xlog_put_bp(bp);
399 return error;
400 }
401
402 /*
403 * Head is defined to be the point of the log where the next log write
404 * could go. This means that incomplete LR writes at the end are
405 * eliminated when calculating the head. We aren't guaranteed that previous
406 * LRs have complete transactions. We only know that a cycle number of
407 * current cycle number -1 won't be present in the log if we start writing
408 * from our current block number.
409 *
410 * last_blk contains the block number of the first block with a given
411 * cycle number.
412 *
413 * Return: zero if normal, non-zero if error.
414 */
415 STATIC int
416 xlog_find_head(
417 struct xlog *log,
418 xfs_daddr_t *return_head_blk)
419 {
420 xfs_buf_t *bp;
421 xfs_caddr_t offset;
422 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
423 int num_scan_bblks;
424 uint first_half_cycle, last_half_cycle;
425 uint stop_on_cycle;
426 int error, log_bbnum = log->l_logBBsize;
427
428 /* Is the end of the log device zeroed? */
429 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
430 *return_head_blk = first_blk;
431
432 /* Is the whole lot zeroed? */
433 if (!first_blk) {
434 /* Linux XFS shouldn't generate totally zeroed logs -
435 * mkfs etc write a dummy unmount record to a fresh
436 * log so we can store the uuid in there
437 */
438 xfs_warn(log->l_mp, "totally zeroed log");
439 }
440
441 return 0;
442 } else if (error) {
443 xfs_warn(log->l_mp, "empty log check failed");
444 return error;
445 }
446
447 first_blk = 0; /* get cycle # of 1st block */
448 bp = xlog_get_bp(log, 1);
449 if (!bp)
450 return ENOMEM;
451
452 error = xlog_bread(log, 0, 1, bp, &offset);
453 if (error)
454 goto bp_err;
455
456 first_half_cycle = xlog_get_cycle(offset);
457
458 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
459 error = xlog_bread(log, last_blk, 1, bp, &offset);
460 if (error)
461 goto bp_err;
462
463 last_half_cycle = xlog_get_cycle(offset);
464 ASSERT(last_half_cycle != 0);
465
466 /*
467 * If the 1st half cycle number is equal to the last half cycle number,
468 * then the entire log is stamped with the same cycle number. In this
469 * case, head_blk can't be set to zero (which makes sense). The below
470 * math doesn't work out properly with head_blk equal to zero. Instead,
471 * we set it to log_bbnum which is an invalid block number, but this
472 * value makes the math correct. If head_blk doesn't change through
473 * all the tests below, *head_blk is set to zero at the very end rather
474 * than log_bbnum. In a sense, log_bbnum and zero are the same block
475 * in a circular file.
476 */
477 if (first_half_cycle == last_half_cycle) {
478 /*
479 * In this case we believe that the entire log should have
480 * cycle number last_half_cycle. We need to scan backwards
481 * from the end verifying that there are no holes still
482 * containing last_half_cycle - 1. If we find such a hole,
483 * then the start of that hole will be the new head. The
484 * simple case looks like
485 * x | x ... | x - 1 | x
486 * Another case that fits this picture would be
487 * x | x + 1 | x ... | x
488 * In this case the head really is somewhere at the end of the
489 * log, as one of the latest writes at the beginning was
490 * incomplete.
491 * One more case is
492 * x | x + 1 | x ... | x - 1 | x
493 * This is really the combination of the above two cases, and
494 * the head has to end up at the start of the x-1 hole at the
495 * end of the log.
496 *
497 * In the 256k log case, we will read from the beginning to the
498 * end of the log and search for cycle numbers equal to x-1.
499 * We don't worry about the x+1 blocks that we encounter,
500 * because we know that they cannot be the head since the log
501 * started with x.
502 */
503 head_blk = log_bbnum;
504 stop_on_cycle = last_half_cycle - 1;
505 } else {
506 /*
507 * In this case we want to find the first block with cycle
508 * number matching last_half_cycle. We expect the log to be
509 * some variation on
510 * x + 1 ... | x ... | x
511 * The first block with cycle number x (last_half_cycle) will
512 * be where the new head belongs. First we do a binary search
513 * for the first occurrence of last_half_cycle. The binary
514 * search may not be totally accurate, so then we scan back
515 * from there looking for occurrences of last_half_cycle before
516 * us. If that backwards scan wraps around the beginning of
517 * the log, then we look for occurrences of last_half_cycle - 1
518 * at the end of the log. The cases we're looking for look
519 * like
520 * v binary search stopped here
521 * x + 1 ... | x | x + 1 | x ... | x
522 * ^ but we want to locate this spot
523 * or
524 * <---------> less than scan distance
525 * x + 1 ... | x ... | x - 1 | x
526 * ^ we want to locate this spot
527 */
528 stop_on_cycle = last_half_cycle;
529 if ((error = xlog_find_cycle_start(log, bp, first_blk,
530 &head_blk, last_half_cycle)))
531 goto bp_err;
532 }
533
534 /*
535 * Now validate the answer. Scan back some number of maximum possible
536 * blocks and make sure each one has the expected cycle number. The
537 * maximum is determined by the total possible amount of buffering
538 * in the in-core log. The following number can be made tighter if
539 * we actually look at the block size of the filesystem.
540 */
541 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
542 if (head_blk >= num_scan_bblks) {
543 /*
544 * We are guaranteed that the entire check can be performed
545 * in one buffer.
546 */
547 start_blk = head_blk - num_scan_bblks;
548 if ((error = xlog_find_verify_cycle(log,
549 start_blk, num_scan_bblks,
550 stop_on_cycle, &new_blk)))
551 goto bp_err;
552 if (new_blk != -1)
553 head_blk = new_blk;
554 } else { /* need to read 2 parts of log */
555 /*
556 * We are going to scan backwards in the log in two parts.
557 * First we scan the physical end of the log. In this part
558 * of the log, we are looking for blocks with cycle number
559 * last_half_cycle - 1.
560 * If we find one, then we know that the log starts there, as
561 * we've found a hole that didn't get written in going around
562 * the end of the physical log. The simple case for this is
563 * x + 1 ... | x ... | x - 1 | x
564 * <---------> less than scan distance
565 * If all of the blocks at the end of the log have cycle number
566 * last_half_cycle, then we check the blocks at the start of
567 * the log looking for occurrences of last_half_cycle. If we
568 * find one, then our current estimate for the location of the
569 * first occurrence of last_half_cycle is wrong and we move
570 * back to the hole we've found. This case looks like
571 * x + 1 ... | x | x + 1 | x ...
572 * ^ binary search stopped here
573 * Another case we need to handle that only occurs in 256k
574 * logs is
575 * x + 1 ... | x ... | x+1 | x ...
576 * ^ binary search stops here
577 * In a 256k log, the scan at the end of the log will see the
578 * x + 1 blocks. We need to skip past those since that is
579 * certainly not the head of the log. By searching for
580 * last_half_cycle-1 we accomplish that.
581 */
582 ASSERT(head_blk <= INT_MAX &&
583 (xfs_daddr_t) num_scan_bblks >= head_blk);
584 start_blk = log_bbnum - (num_scan_bblks - head_blk);
585 if ((error = xlog_find_verify_cycle(log, start_blk,
586 num_scan_bblks - (int)head_blk,
587 (stop_on_cycle - 1), &new_blk)))
588 goto bp_err;
589 if (new_blk != -1) {
590 head_blk = new_blk;
591 goto validate_head;
592 }
593
594 /*
595 * Scan beginning of log now. The last part of the physical
596 * log is good. This scan needs to verify that it doesn't find
597 * the last_half_cycle.
598 */
599 start_blk = 0;
600 ASSERT(head_blk <= INT_MAX);
601 if ((error = xlog_find_verify_cycle(log,
602 start_blk, (int)head_blk,
603 stop_on_cycle, &new_blk)))
604 goto bp_err;
605 if (new_blk != -1)
606 head_blk = new_blk;
607 }
608
609 validate_head:
610 /*
611 * Now we need to make sure head_blk is not pointing to a block in
612 * the middle of a log record.
613 */
614 num_scan_bblks = XLOG_REC_SHIFT(log);
615 if (head_blk >= num_scan_bblks) {
616 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
617
618 /* start ptr at last block ptr before head_blk */
619 if ((error = xlog_find_verify_log_record(log, start_blk,
620 &head_blk, 0)) == -1) {
621 error = XFS_ERROR(EIO);
622 goto bp_err;
623 } else if (error)
624 goto bp_err;
625 } else {
626 start_blk = 0;
627 ASSERT(head_blk <= INT_MAX);
628 if ((error = xlog_find_verify_log_record(log, start_blk,
629 &head_blk, 0)) == -1) {
630 /* We hit the beginning of the log during our search */
631 start_blk = log_bbnum - (num_scan_bblks - head_blk);
632 new_blk = log_bbnum;
633 ASSERT(start_blk <= INT_MAX &&
634 (xfs_daddr_t) log_bbnum-start_blk >= 0);
635 ASSERT(head_blk <= INT_MAX);
636 if ((error = xlog_find_verify_log_record(log,
637 start_blk, &new_blk,
638 (int)head_blk)) == -1) {
639 error = XFS_ERROR(EIO);
640 goto bp_err;
641 } else if (error)
642 goto bp_err;
643 if (new_blk != log_bbnum)
644 head_blk = new_blk;
645 } else if (error)
646 goto bp_err;
647 }
648
649 xlog_put_bp(bp);
650 if (head_blk == log_bbnum)
651 *return_head_blk = 0;
652 else
653 *return_head_blk = head_blk;
654 /*
655 * When returning here, we have a good block number. Bad block
656 * means that during a previous crash, we didn't have a clean break
657 * from cycle number N to cycle number N-1. In this case, we need
658 * to find the first block with cycle number N-1.
659 */
660 return 0;
661
662 bp_err:
663 xlog_put_bp(bp);
664
665 if (error)
666 xfs_warn(log->l_mp, "failed to find log head");
667 return error;
668 }
669
670 /*
671 * Find the sync block number or the tail of the log.
672 *
673 * This will be the block number of the last record to have its
674 * associated buffers synced to disk. Every log record header has
675 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
676 * to get a sync block number. The only concern is to figure out which
677 * log record header to believe.
678 *
679 * The following algorithm uses the log record header with the largest
680 * lsn. The entire log record does not need to be valid. We only care
681 * that the header is valid.
682 *
683 * We could speed up search by using current head_blk buffer, but it is not
684 * available.
685 */
686 int
687 xlog_find_tail(
688 struct xlog *log,
689 xfs_daddr_t *head_blk,
690 xfs_daddr_t *tail_blk)
691 {
692 xlog_rec_header_t *rhead;
693 xlog_op_header_t *op_head;
694 xfs_caddr_t offset = NULL;
695 xfs_buf_t *bp;
696 int error, i, found;
697 xfs_daddr_t umount_data_blk;
698 xfs_daddr_t after_umount_blk;
699 xfs_lsn_t tail_lsn;
700 int hblks;
701
702 found = 0;
703
704 /*
705 * Find previous log record
706 */
707 if ((error = xlog_find_head(log, head_blk)))
708 return error;
709
710 bp = xlog_get_bp(log, 1);
711 if (!bp)
712 return ENOMEM;
713 if (*head_blk == 0) { /* special case */
714 error = xlog_bread(log, 0, 1, bp, &offset);
715 if (error)
716 goto done;
717
718 if (xlog_get_cycle(offset) == 0) {
719 *tail_blk = 0;
720 /* leave all other log inited values alone */
721 goto done;
722 }
723 }
724
725 /*
726 * Search backwards looking for log record header block
727 */
728 ASSERT(*head_blk < INT_MAX);
729 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
730 error = xlog_bread(log, i, 1, bp, &offset);
731 if (error)
732 goto done;
733
734 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
735 found = 1;
736 break;
737 }
738 }
739 /*
740 * If we haven't found the log record header block, start looking
741 * again from the end of the physical log. XXXmiken: There should be
742 * a check here to make sure we didn't search more than N blocks in
743 * the previous code.
744 */
745 if (!found) {
746 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
747 error = xlog_bread(log, i, 1, bp, &offset);
748 if (error)
749 goto done;
750
751 if (*(__be32 *)offset ==
752 cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
753 found = 2;
754 break;
755 }
756 }
757 }
758 if (!found) {
759 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
760 xlog_put_bp(bp);
761 ASSERT(0);
762 return XFS_ERROR(EIO);
763 }
764
765 /* find blk_no of tail of log */
766 rhead = (xlog_rec_header_t *)offset;
767 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
768
769 /*
770 * Reset log values according to the state of the log when we
771 * crashed. In the case where head_blk == 0, we bump curr_cycle
772 * one because the next write starts a new cycle rather than
773 * continuing the cycle of the last good log record. At this
774 * point we have guaranteed that all partial log records have been
775 * accounted for. Therefore, we know that the last good log record
776 * written was complete and ended exactly on the end boundary
777 * of the physical log.
778 */
779 log->l_prev_block = i;
780 log->l_curr_block = (int)*head_blk;
781 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
782 if (found == 2)
783 log->l_curr_cycle++;
784 atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
785 atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
786 xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
787 BBTOB(log->l_curr_block));
788 xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
789 BBTOB(log->l_curr_block));
790
791 /*
792 * Look for unmount record. If we find it, then we know there
793 * was a clean unmount. Since 'i' could be the last block in
794 * the physical log, we convert to a log block before comparing
795 * to the head_blk.
796 *
797 * Save the current tail lsn to use to pass to
798 * xlog_clear_stale_blocks() below. We won't want to clear the
799 * unmount record if there is one, so we pass the lsn of the
800 * unmount record rather than the block after it.
801 */
802 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
803 int h_size = be32_to_cpu(rhead->h_size);
804 int h_version = be32_to_cpu(rhead->h_version);
805
806 if ((h_version & XLOG_VERSION_2) &&
807 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
808 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
809 if (h_size % XLOG_HEADER_CYCLE_SIZE)
810 hblks++;
811 } else {
812 hblks = 1;
813 }
814 } else {
815 hblks = 1;
816 }
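/*
 * after_umount_blk is the block just past this record: header blocks
 * plus the payload length in basic blocks, wrapped modulo the physical
 * log size. If the head sits exactly there and the record holds a
 * single log operation, it may be a clean unmount record.
 */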
817 after_umount_blk = (i + hblks + (int)
818 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
819 tail_lsn = atomic64_read(&log->l_tail_lsn);
820 if (*head_blk == after_umount_blk &&
821 be32_to_cpu(rhead->h_num_logops) == 1) {
822 umount_data_blk = (i + hblks) % log->l_logBBsize;
823 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
824 if (error)
825 goto done;
826
827 op_head = (xlog_op_header_t *)offset;
828 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
829 /*
830 * Set tail and last sync so that newly written
831 * log records will point recovery to after the
832 * current unmount record.
833 */
834 xlog_assign_atomic_lsn(&log->l_tail_lsn,
835 log->l_curr_cycle, after_umount_blk);
836 xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
837 log->l_curr_cycle, after_umount_blk);
838 *tail_blk = after_umount_blk;
839
840 /*
841 * Note that the unmount was clean. If the unmount
842 * was not clean, we need to know this to rebuild the
843 * superblock counters from the perag headers if we
844 * have a filesystem using non-persistent counters.
845 */
846 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
847 }
848 }
849
850 /*
851 * Make sure that there are no blocks in front of the head
852 * with the same cycle number as the head. This can happen
853 * because we allow multiple outstanding log writes concurrently,
854 * and the later writes might make it out before earlier ones.
855 *
856 * We use the lsn from before modifying it so that we'll never
857 * overwrite the unmount record after a clean unmount.
858 *
859 * Do this only if we are going to recover the filesystem
860 *
861 * NOTE: This used to say "if (!readonly)"
862 * However on Linux, we can & do recover a read-only filesystem.
863 * We only skip recovery if NORECOVERY is specified on mount,
864 * in which case we would not be here.
865 *
866 * But... if the -device- itself is readonly, just skip this.
867 * We can't recover this device anyway, so it won't matter.
868 */
869 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
870 error = xlog_clear_stale_blocks(log, tail_lsn);
871
872 done:
873 xlog_put_bp(bp);
874
875 if (error)
876 xfs_warn(log->l_mp, "failed to locate log tail");
877 return error;
878 }
879
880 /*
881 * Is the log zeroed at all?
882 *
883 * The last binary search should be changed to perform an X block read
884 * once X becomes small enough. You can then search linearly through
885 * the X blocks. This will cut down on the number of reads we need to do.
886 *
887 * If the log is partially zeroed, this routine will pass back the blkno
888 * of the first block with cycle number 0. It won't have a complete LR
889 * preceding it.
890 *
891 * Return:
892 * 0 => the log is completely written to
893 * -1 => use *blk_no as the first block of the log
894 * >0 => error has occurred
895 */
896 int
897 xlog_find_zeroed(
898 struct xlog *log,
899 xfs_daddr_t *blk_no)
900 {
901 xfs_buf_t *bp;
902 xfs_caddr_t offset;
903 uint first_cycle, last_cycle;
904 xfs_daddr_t new_blk, last_blk, start_blk;
905 xfs_daddr_t num_scan_bblks;
906 int error, log_bbnum = log->l_logBBsize;
907
908 *blk_no = 0;
909
910 /* check totally zeroed log */
911 bp = xlog_get_bp(log, 1);
912 if (!bp)
913 return ENOMEM;
914 error = xlog_bread(log, 0, 1, bp, &offset);
915 if (error)
916 goto bp_err;
917
918 first_cycle = xlog_get_cycle(offset);
919 if (first_cycle == 0) { /* completely zeroed log */
920 *blk_no = 0;
921 xlog_put_bp(bp);
922 return -1;
923 }
924
925 /* check partially zeroed log */
926 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
927 if (error)
928 goto bp_err;
929
930 last_cycle = xlog_get_cycle(offset);
931 if (last_cycle != 0) { /* log completely written to */
932 xlog_put_bp(bp);
933 return 0;
934 } else if (first_cycle != 1) {
935 /*
936 * If the cycle of the last block is zero, the cycle of
937 * the first block must be 1. If it's not, maybe we're
938 * not looking at a log... Bail out.
939 */
940 xfs_warn(log->l_mp,
941 "Log inconsistent or not a log (last==0, first!=1)");
942 error = XFS_ERROR(EINVAL);
943 goto bp_err;
944 }
945
946 /* we have a partially zeroed log */
947 last_blk = log_bbnum-1;
948 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
949 goto bp_err;
950
951 /*
952 * Validate the answer. Because there is no way to guarantee that
953 * the entire log is made up of log records which are the same size,
954 * we scan over the defined maximum blocks. At this point, the maximum
955 * is not chosen to mean anything special. XXXmiken
956 */
957 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
958 ASSERT(num_scan_bblks <= INT_MAX);
959
960 if (last_blk < num_scan_bblks)
961 num_scan_bblks = last_blk;
962 start_blk = last_blk - num_scan_bblks;
963
964 /*
965 * We search for any instances of cycle number 0 that occur before
966 * our current estimate of the head. What we're trying to detect is
967 * 1 ... | 0 | 1 | 0...
968 * ^ binary search ends here
969 */
970 if ((error = xlog_find_verify_cycle(log, start_blk,
971 (int)num_scan_bblks, 0, &new_blk)))
972 goto bp_err;
973 if (new_blk != -1)
974 last_blk = new_blk;
975
976 /*
977 * Potentially back up over a partial log record write. We don't need
978 * to search the end of the log because we know it is zero.
979 */
980 if ((error = xlog_find_verify_log_record(log, start_blk,
981 &last_blk, 0)) == -1) {
982 error = XFS_ERROR(EIO);
983 goto bp_err;
984 } else if (error)
985 goto bp_err;
986
987 *blk_no = last_blk;
988 bp_err:
989 xlog_put_bp(bp);
990 if (error)
991 return error;
992 return -1;
993 }
994
995 STATIC xlog_recover_t *
996 xlog_recover_find_tid(
997 struct hlist_head *head,
998 xlog_tid_t tid)
999 {
1000 xlog_recover_t *trans;
1001 struct hlist_node *n;
1002
1003 hlist_for_each_entry(trans, n, head, r_list) {
1004 if (trans->r_log_tid == tid)
1005 return trans;
1006 }
1007 return NULL;
1008 }
1009
1010 STATIC void
1011 xlog_recover_new_tid(
1012 struct hlist_head *head,
1013 xlog_tid_t tid,
1014 xfs_lsn_t lsn)
1015 {
1016 xlog_recover_t *trans;
1017
1018 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1019 trans->r_log_tid = tid;
1020 trans->r_lsn = lsn;
1021 INIT_LIST_HEAD(&trans->r_itemq);
1022
1023 INIT_HLIST_NODE(&trans->r_list);
1024 hlist_add_head(&trans->r_list, head);
1025 }
1026
1027 STATIC void
1028 xlog_recover_add_item(
1029 struct list_head *head)
1030 {
1031 xlog_recover_item_t *item;
1032
1033 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1034 INIT_LIST_HEAD(&item->ri_list);
1035 list_add_tail(&item->ri_list, head);
1036 }
1037
1038 #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
1039
1040 STATIC int
1041 xlog_recover_add_to_cont_trans(
1042 struct xlog *log,
1043 struct xlog_recover *trans,
1044 xfs_caddr_t dp,
1045 int len)
1046 {
1047 xlog_recover_item_t *item;
1048 xfs_caddr_t ptr, old_ptr;
1049 int old_len;
1050
1051 if (list_empty(&trans->r_itemq)) {
1052 /* finish copying rest of trans header */
1053 xlog_recover_add_item(&trans->r_itemq);
1054 ptr = (xfs_caddr_t) &trans->r_theader +
1055 sizeof(xfs_trans_header_t) - len;
1056 memcpy(ptr, dp, len); /* d, s, l */
1057 return 0;
1058 }
1059 /* take the tail entry */
1060 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1061
1062 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1063 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1064
1065 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1066 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1067 item->ri_buf[item->ri_cnt-1].i_len += len;
1068 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1069 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1070 return 0;
1071 }
1072
1073 /*
1074 * The next region to add is the start of a new region. It could be
1075 * a whole region or it could be the first part of a new region. Because
1076 * of this, the assumption here is that the type and size fields of all
1077 * format structures fit into the first 32 bits of the structure.
1078 *
1079 * This works because all regions must be 32 bit aligned. Therefore, we
1080 * either have both fields or we have neither field. In the case we have
1081 * neither field, the data part of the region is zero length. We only have
1082 * a log_op_header and can throw away the header since a new one will appear
1083 * later. If we have at least 4 bytes, then we can determine how many regions
1084 * will appear in the current log item.
1085 */
1086 STATIC int
1087 xlog_recover_add_to_trans(
1088 struct xlog *log,
1089 struct xlog_recover *trans,
1090 xfs_caddr_t dp,
1091 int len)
1092 {
1093 xfs_inode_log_format_t *in_f; /* any will do */
1094 xlog_recover_item_t *item;
1095 xfs_caddr_t ptr;
1096
1097 if (!len)
1098 return 0;
1099 if (list_empty(&trans->r_itemq)) {
1100 /* we need to catch log corruptions here */
1101 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1102 xfs_warn(log->l_mp, "%s: bad header magic number",
1103 __func__);
1104 ASSERT(0);
1105 return XFS_ERROR(EIO);
1106 }
1107 if (len == sizeof(xfs_trans_header_t))
1108 xlog_recover_add_item(&trans->r_itemq);
1109 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1110 return 0;
1111 }
1112
1113 ptr = kmem_alloc(len, KM_SLEEP);
1114 memcpy(ptr, dp, len);
1115 in_f = (xfs_inode_log_format_t *)ptr;
1116
1117 /* take the tail entry */
1118 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1119 if (item->ri_total != 0 &&
1120 item->ri_total == item->ri_cnt) {
1121 /* tail item is in use, get a new one */
1122 xlog_recover_add_item(&trans->r_itemq);
1123 item = list_entry(trans->r_itemq.prev,
1124 xlog_recover_item_t, ri_list);
1125 }
1126
1127 if (item->ri_total == 0) { /* first region to be added */
1128 if (in_f->ilf_size == 0 ||
1129 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1130 xfs_warn(log->l_mp,
1131 "bad number of regions (%d) in inode log format",
1132 in_f->ilf_size);
1133 ASSERT(0);
1134 return XFS_ERROR(EIO);
1135 }
1136
1137 item->ri_total = in_f->ilf_size;
1138 item->ri_buf =
1139 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1140 KM_SLEEP);
1141 }
1142 ASSERT(item->ri_total > item->ri_cnt);
1143 /* Description region is ri_buf[0] */
1144 item->ri_buf[item->ri_cnt].i_addr = ptr;
1145 item->ri_buf[item->ri_cnt].i_len = len;
1146 item->ri_cnt++;
1147 trace_xfs_log_recover_item_add(log, trans, item, 0);
1148 return 0;
1149 }
1150
1151 /*
1152 * Free up any resources allocated by the transaction
1153 *
1154 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
1155 */
1156 STATIC void
1157 xlog_recover_free_trans(
1158 struct xlog_recover *trans)
1159 {
1160 xlog_recover_item_t *item, *n;
1161 int i;
1162
1163 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
1164 /* Free the regions in the item. */
1165 list_del(&item->ri_list);
1166 for (i = 0; i < item->ri_cnt; i++)
1167 kmem_free(item->ri_buf[i].i_addr);
1168 /* Free the item itself */
1169 kmem_free(item->ri_buf);
1170 kmem_free(item);
1171 }
1172 /* Free the transaction recover structure */
1173 kmem_free(trans);
1174 }
1175
1176 /*
1177 * Perform the transaction.
1178 *
1179 * If the transaction modifies a buffer or inode, do it now. Otherwise,
1180 * EFIs and EFDs get queued up by adding entries into the AIL for them.
1181 */
1182 STATIC int
1183 xlog_recover_commit_trans(
1184 struct xlog *log,
1185 struct xlog_recover *trans,
1186 int pass)
1187 {
1188 int error = 0;
1189
1190 hlist_del(&trans->r_list);
1191 if ((error = xlog_recover_do_trans(log, trans, pass)))
1192 return error;
1193
1194 xlog_recover_free_trans(trans);
1195 return 0;
1196 }
1197
1198 STATIC int
1199 xlog_recover_unmount_trans(
1200 struct xlog *log, xlog_recover_t *trans)
1201 {
1202 /* Do nothing now */
1203 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
1204 return 0;
1205 }
1206
1207 /*
1208 * There are two valid states of the r_state field. 0 indicates that the
1209 * transaction structure is in a normal state. We have either seen the
1210 * start of the transaction or the last operation we added was not a partial
1211 * operation. If the last operation we added to the transaction was a
1212 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
1213 *
1214 * NOTE: skip LRs with 0 data length.
1215 */
1216 STATIC int
1217 xlog_recover_process_data(
1218 struct xlog *log,
1219 struct hlist_head rhash[],
1220 struct xlog_rec_header *rhead,
1221 xfs_caddr_t dp,
1222 int pass)
1223 {
1224 xfs_caddr_t lp;
1225 int num_logops;
1226 xlog_op_header_t *ohead;
1227 xlog_recover_t *trans;
1228 xlog_tid_t tid;
1229 int error;
1230 unsigned long hash;
1231 uint flags;
1232
1233 lp = dp + be32_to_cpu(rhead->h_len);
1234 num_logops = be32_to_cpu(rhead->h_num_logops);
1235
1236 /* check the log format matches our own - else we can't recover */
1237 if (xlog_header_check_recover(log->l_mp, rhead))
1238 return (XFS_ERROR(EIO));
1239
1240 while ((dp < lp) && num_logops) {
1241 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
1242 ohead = (xlog_op_header_t *)dp;
1243 dp += sizeof(xlog_op_header_t);
1244 if (ohead->oh_clientid != XFS_TRANSACTION &&
1245 ohead->oh_clientid != XFS_LOG) {
1246 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
1247 __func__, ohead->oh_clientid);
1248 ASSERT(0);
1249 return (XFS_ERROR(EIO));
1250 }
1251 tid = be32_to_cpu(ohead->oh_tid);
1252 hash = XLOG_RHASH(tid);
1253 trans = xlog_recover_find_tid(&rhash[hash], tid);
1254 if (trans == NULL) { /* not found; add new tid */
1255 if (ohead->oh_flags & XLOG_START_TRANS)
1256 xlog_recover_new_tid(&rhash[hash], tid,
1257 be64_to_cpu(rhead->h_lsn));
1258 } else {
1259 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
1260 xfs_warn(log->l_mp, "%s: bad length 0x%x",
1261 __func__, be32_to_cpu(ohead->oh_len));
1262 return (XFS_ERROR(EIO));
1263 }
1264 flags = ohead->oh_flags & ~XLOG_END_TRANS;
1265 if (flags & XLOG_WAS_CONT_TRANS)
1266 flags &= ~XLOG_CONTINUE_TRANS;
1267 switch (flags) {
1268 case XLOG_COMMIT_TRANS:
1269 error = xlog_recover_commit_trans(log,
1270 trans, pass);
1271 break;
1272 case XLOG_UNMOUNT_TRANS:
1273 error = xlog_recover_unmount_trans(log, trans);
1274 break;
1275 case XLOG_WAS_CONT_TRANS:
1276 error = xlog_recover_add_to_cont_trans(log,
1277 trans, dp,
1278 be32_to_cpu(ohead->oh_len));
1279 break;
1280 case XLOG_START_TRANS:
1281 xfs_warn(log->l_mp, "%s: bad transaction",
1282 __func__);
1283 ASSERT(0);
1284 error = XFS_ERROR(EIO);
1285 break;
1286 case 0:
1287 case XLOG_CONTINUE_TRANS:
1288 error = xlog_recover_add_to_trans(log, trans,
1289 dp, be32_to_cpu(ohead->oh_len));
1290 break;
1291 default:
1292 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
1293 __func__, flags);
1294 ASSERT(0);
1295 error = XFS_ERROR(EIO);
1296 break;
1297 }
1298 if (error)
1299 return error;
1300 }
1301 dp += be32_to_cpu(ohead->oh_len);
1302 num_logops--;
1303 }
1304 return 0;
1305 }
1306
1307 /*
1308 * Unpack the log buffer data and crc check it. If the check fails, issue a
1309 * warning if and only if the CRC in the header is non-zero. This makes the
1310 * check an advisory warning, and the zero CRC check will prevent failure
1311 * warnings from being emitted when upgrading the kernel from one that does not
1312 * add CRCs by default.
1313 *
1314 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
1315 * corruption failure.
1316 *
1317 * XXX: we do not calculate the CRC here yet. It's not clear what we should do
1318 * with CRC errors here in userspace, so we'll address that problem later on.
1319 */
1320 #define xlog_cksum(l,r,dp,len) ((r)->h_crc)
1321 STATIC int
1322 xlog_unpack_data_crc(
1323 struct xlog_rec_header *rhead,
1324 xfs_caddr_t dp,
1325 struct xlog *log)
1326 {
1327 __le32 crc;
1328
1329 crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
1330 if (crc != rhead->h_crc) {
1331 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
1332 xfs_alert(log->l_mp,
1333 "log record CRC mismatch: found 0x%x, expected 0x%x.",
1334 le32_to_cpu(rhead->h_crc),
1335 le32_to_cpu(crc));
1336 xfs_hex_dump(dp, 32);
1337 }
1338
1339 /*
1340 * If we've detected a log record corruption, then we can't
1341 * recover past this point. Abort recovery if we are enforcing
1342 * CRC protection by punting an error back up the stack.
1343 */
1344 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
1345 return EFSCORRUPTED;
1346 }
1347
1348 return 0;
1349 }
1350
1351 STATIC int
1352 xlog_unpack_data(
1353 struct xlog_rec_header *rhead,
1354 xfs_caddr_t dp,
1355 struct xlog *log)
1356 {
1357 int i, j, k;
1358 int error;
1359
1360 error = xlog_unpack_data_crc(rhead, dp, log);
1361 if (error)
1362 return error;
1363
1364 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
1365 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
1366 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
1367 dp += BBSIZE;
1368 }
1369
1370 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1371 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
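/*
 * Blocks past the first XLOG_HEADER_CYCLE_SIZE / BBSIZE (== 64) have
 * their saved cycle words in the headers that follow the main record
 * header: block i's word is in header j = i / 64 at slot k = i % 64
 * (j == 0 being the main record header itself).
 */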
1372 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
1373 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1374 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1375 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
1376 dp += BBSIZE;
1377 }
1378 }
1379
1380 return 0;
1381 }
1382
1383 STATIC int
1384 xlog_valid_rec_header(
1385 struct xlog *log,
1386 struct xlog_rec_header *rhead,
1387 xfs_daddr_t blkno)
1388 {
1389 int hlen;
1390
1391 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
1392 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
1393 XFS_ERRLEVEL_LOW, log->l_mp);
1394 return XFS_ERROR(EFSCORRUPTED);
1395 }
1396 if (unlikely(
1397 (!rhead->h_version ||
1398 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
1399 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
1400 __func__, be32_to_cpu(rhead->h_version));
1401 return XFS_ERROR(EIO);
1402 }
1403
1404 /* LR body must have data or it wouldn't have been written */
1405 hlen = be32_to_cpu(rhead->h_len);
1406 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
1407 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
1408 XFS_ERRLEVEL_LOW, log->l_mp);
1409 return XFS_ERROR(EFSCORRUPTED);
1410 }
1411 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
1412 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
1413 XFS_ERRLEVEL_LOW, log->l_mp);
1414 return XFS_ERROR(EFSCORRUPTED);
1415 }
1416 return 0;
1417 }
1418
1419 /*
1420 * Read the log from tail to head and process the log records found.
1421 * Handle the two cases where the tail and head are in the same cycle
1422 * and where the active portion of the log wraps around the end of
1423 * the physical log separately. The pass parameter is passed through
1424 * to the routines called to process the data and is not looked at
1425 * here.
1426 */
1427 int
1428 xlog_do_recovery_pass(
1429 struct xlog *log,
1430 xfs_daddr_t head_blk,
1431 xfs_daddr_t tail_blk,
1432 int pass)
1433 {
1434 xlog_rec_header_t *rhead;
1435 xfs_daddr_t blk_no;
1436 xfs_caddr_t offset;
1437 xfs_buf_t *hbp, *dbp;
1438 int error = 0, h_size;
1439 int bblks, split_bblks;
1440 int hblks, split_hblks, wrapped_hblks;
1441 struct hlist_head rhash[XLOG_RHASH_SIZE];
1442
1443 ASSERT(head_blk != tail_blk);
1444
1445 /*
1446 * Read the header of the tail block and get the iclog buffer size from
1447 * h_size. Use this to tell how many sectors make up the log header.
1448 */
1449 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1450 /*
1451 * When using variable length iclogs, read first sector of
1452 * iclog header and extract the header size from it. Get a
1453 * new hbp that is the correct size.
1454 */
1455 hbp = xlog_get_bp(log, 1);
1456 if (!hbp)
1457 return ENOMEM;
1458
1459 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
1460 if (error)
1461 goto bread_err1;
1462
1463 rhead = (xlog_rec_header_t *)offset;
1464 error = xlog_valid_rec_header(log, rhead, tail_blk);
1465 if (error)
1466 goto bread_err1;
1467 h_size = be32_to_cpu(rhead->h_size);
1468 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
1469 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1470 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1471 if (h_size % XLOG_HEADER_CYCLE_SIZE)
1472 hblks++;
1473 xlog_put_bp(hbp);
1474 hbp = xlog_get_bp(log, hblks);
1475 } else {
1476 hblks = 1;
1477 }
1478 } else {
1479 ASSERT(log->l_sectBBsize == 1);
1480 hblks = 1;
1481 hbp = xlog_get_bp(log, 1);
1482 h_size = XLOG_BIG_RECORD_BSIZE;
1483 }
1484
1485 if (!hbp)
1486 return ENOMEM;
1487 dbp = xlog_get_bp(log, BTOBB(h_size));
1488 if (!dbp) {
1489 xlog_put_bp(hbp);
1490 return ENOMEM;
1491 }
1492
1493 memset(rhash, 0, sizeof(rhash));
1494 if (tail_blk <= head_blk) {
1495 for (blk_no = tail_blk; blk_no < head_blk; ) {
1496 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1497 if (error)
1498 goto bread_err2;
1499
1500 rhead = (xlog_rec_header_t *)offset;
1501 error = xlog_valid_rec_header(log, rhead, blk_no);
1502 if (error)
1503 goto bread_err2;
1504
1505 /* blocks in data section */
1506 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1507 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
1508 &offset);
1509 if (error)
1510 goto bread_err2;
1511
1512 error = xlog_unpack_data(rhead, offset, log);
1513 if (error)
1514 goto bread_err2;
1515
1516 error = xlog_recover_process_data(log,
1517 rhash, rhead, offset, pass);
1518 if (error)
1519 goto bread_err2;
1520 blk_no += bblks + hblks;
1521 }
1522 } else {
1523 /*
1524 * Perform recovery around the end of the physical log.
1525 * When the head is not on the same cycle number as the tail,
1526 * we can't do a sequential recovery as above.
1527 */
1528 blk_no = tail_blk;
1529 while (blk_no < log->l_logBBsize) {
1530 /*
1531 * Check for header wrapping around physical end-of-log
1532 */
1533 offset = hbp->b_addr;
1534 split_hblks = 0;
1535 wrapped_hblks = 0;
1536 if (blk_no + hblks <= log->l_logBBsize) {
1537 /* Read header in one read */
1538 error = xlog_bread(log, blk_no, hblks, hbp,
1539 &offset);
1540 if (error)
1541 goto bread_err2;
1542 } else {
1543 /* This LR is split across physical log end */
1544 if (blk_no != log->l_logBBsize) {
1545 /* some data before physical log end */
1546 ASSERT(blk_no <= INT_MAX);
1547 split_hblks = log->l_logBBsize - (int)blk_no;
1548 ASSERT(split_hblks > 0);
1549 error = xlog_bread(log, blk_no,
1550 split_hblks, hbp,
1551 &offset);
1552 if (error)
1553 goto bread_err2;
1554 }
1555
1556 /*
1557 * Note: this black magic still works with
1558 * large sector sizes (non-512) only because:
1559 * - we increased the buffer size originally
1560 * by 1 sector giving us enough extra space
1561 * for the second read;
1562 * - the log start is guaranteed to be sector
1563 * aligned;
1564 * - we read the log end (LR header start)
1565 * _first_, then the log start (LR header end)
1566 * - order is important.
1567 */
1568 wrapped_hblks = hblks - split_hblks;
1569 error = xlog_bread_offset(log, 0,
1570 wrapped_hblks, hbp,
1571 offset + BBTOB(split_hblks));
1572 if (error)
1573 goto bread_err2;
1574 }
1575 rhead = (xlog_rec_header_t *)offset;
1576 error = xlog_valid_rec_header(log, rhead,
1577 split_hblks ? blk_no : 0);
1578 if (error)
1579 goto bread_err2;
1580
1581 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1582 blk_no += hblks;
1583
1584 /* Read in data for log record */
1585 if (blk_no + bblks <= log->l_logBBsize) {
1586 error = xlog_bread(log, blk_no, bblks, dbp,
1587 &offset);
1588 if (error)
1589 goto bread_err2;
1590 } else {
1591 /* This log record is split across the
1592 * physical end of log */
1593 offset = dbp->b_addr;
1594 split_bblks = 0;
1595 if (blk_no != log->l_logBBsize) {
1596 /* some data is before the physical
1597 * end of log */
1598 ASSERT(!wrapped_hblks);
1599 ASSERT(blk_no <= INT_MAX);
1600 split_bblks =
1601 log->l_logBBsize - (int)blk_no;
1602 ASSERT(split_bblks > 0);
1603 error = xlog_bread(log, blk_no,
1604 split_bblks, dbp,
1605 &offset);
1606 if (error)
1607 goto bread_err2;
1608 }
1609
1610 /*
1611 * Note: this black magic still works with
1612 * large sector sizes (non-512) only because:
1613 * - we increased the buffer size originally
1614 * by 1 sector giving us enough extra space
1615 * for the second read;
1616 * - the log start is guaranteed to be sector
1617 * aligned;
1618 * - we read the log end (LR header start)
1619 * _first_, then the log start (LR header end)
1620 * - order is important.
1621 */
1622 error = xlog_bread_offset(log, 0,
1623 bblks - split_bblks, dbp,
1624 offset + BBTOB(split_bblks));
1625 if (error)
1626 goto bread_err2;
1627 }
1628
1629 error = xlog_unpack_data(rhead, offset, log);
1630 if (error)
1631 goto bread_err2;
1632
1633 error = xlog_recover_process_data(log, rhash,
1634 rhead, offset, pass);
1635 if (error)
1636 goto bread_err2;
1637 blk_no += bblks;
1638 }
1639
1640 ASSERT(blk_no >= log->l_logBBsize);
1641 blk_no -= log->l_logBBsize;
1642
1643 /* read first part of physical log */
1644 while (blk_no < head_blk) {
1645 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1646 if (error)
1647 goto bread_err2;
1648
1649 rhead = (xlog_rec_header_t *)offset;
1650 error = xlog_valid_rec_header(log, rhead, blk_no);
1651 if (error)
1652 goto bread_err2;
1653
1654 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1655 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
1656 &offset);
1657 if (error)
1658 goto bread_err2;
1659
1660 error = xlog_unpack_data(rhead, offset, log);
1661 if (error)
1662 goto bread_err2;
1663
1664 error = xlog_recover_process_data(log, rhash,
1665 rhead, offset, pass);
1666 if (error)
1667 goto bread_err2;
1668 blk_no += bblks + hblks;
1669 }
1670 }
1671
1672 bread_err2:
1673 xlog_put_bp(dbp);
1674 bread_err1:
1675 xlog_put_bp(hbp);
1676 return error;
1677 }