]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - libxlog/xfs_log_recover.c
add libxlog directory.
[thirdparty/xfsprogs-dev.git] / libxlog / xfs_log_recover.c
1 /*
2 * Copyright (c) 2000 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33 #include <libxlog.h>
34
35 /*
36 * This routine finds (to an approximation) the first block in the physical
37 * log which contains the given cycle. It uses a binary search algorithm.
38 * Note that the algorithm can not be perfect because the disk will not
39 * necessarily be perfect.
40 */
41 int
42 xlog_find_cycle_start(xlog_t *log,
43 xfs_buf_t *bp,
44 xfs_daddr_t first_blk,
45 xfs_daddr_t *last_blk,
46 uint cycle)
47 {
48 xfs_daddr_t mid_blk;
49 uint mid_cycle;
50 int error;
51
52 mid_blk = BLK_AVG(first_blk, *last_blk);
53 while (mid_blk != first_blk && mid_blk != *last_blk) {
54 if ((error = xlog_bread(log, mid_blk, 1, bp)))
55 return error;
56 mid_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
57 if (mid_cycle == cycle) {
58 *last_blk = mid_blk;
59 /* last_half_cycle == mid_cycle */
60 } else {
61 first_blk = mid_blk;
62 /* first_half_cycle == mid_cycle */
63 }
64 mid_blk = BLK_AVG(first_blk, *last_blk);
65 }
66 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
67 (mid_blk == *last_blk && mid_blk-1 == first_blk));
68
69 return 0;
70 } /* xlog_find_cycle_start */
71
72
73 /*
74 * Check that the range of blocks does not contain the cycle number
75 * given. The scan needs to occur from front to back and the ptr into the
76 * region must be updated since a later routine will need to perform another
77 * test. If the region is completely good, we end up returning the same
78 * last block number.
79 *
80 * Return -1 if we encounter no errors. This is an invalid block number
81 * since we don't ever expect logs to get this large.
82 */
83
84 STATIC xfs_daddr_t
85 xlog_find_verify_cycle( xlog_t *log,
86 xfs_daddr_t start_blk,
87 int nbblks,
88 uint stop_on_cycle_no)
89 {
90 int i, j;
91 uint cycle;
92 xfs_buf_t *bp;
93 char *buf = NULL;
94 int error = 0;
95 xfs_daddr_t bufblks = nbblks;
96
97 while (!(bp = xlog_get_bp(bufblks, log->l_mp))) {
98 /* can't get enough memory to do everything in one big buffer */
99 bufblks >>= 1;
100 if (!bufblks)
101 return -ENOMEM;
102 }
103
104
105 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
106 int bcount = min(bufblks, (start_blk + nbblks - i));
107
108 if ((error = xlog_bread(log, i, bcount, bp)))
109 goto out;
110
111 buf = XFS_BUF_PTR(bp);
112 for (j = 0; j < bcount; j++) {
113 cycle = GET_CYCLE(buf, ARCH_CONVERT);
114 if (cycle == stop_on_cycle_no) {
115 error = i;
116 goto out;
117 }
118
119 buf += BBSIZE;
120 }
121 }
122
123 error = -1;
124
125 out:
126 xlog_put_bp(bp);
127
128 return error;
129 } /* xlog_find_verify_cycle */
130
131
/*
 * Potentially backup over partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record.  Therefore, we subtract one to get the block number
 * of the last block in the given buffer.  extra_bblks contains the number
 * of blocks we would have read on a previous read.  This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 *
 * Returns 0 on success (possibly pulling *last_blk back to the record
 * header when it pointed into the middle of a record), -1 when the scan
 * reached block 0 without finding a header, or a positive error.
 */

STATIC int
xlog_find_verify_log_record(xlog_t *log,
			    xfs_daddr_t start_blk,
			    xfs_daddr_t *last_blk,
			    int extra_bblks)
{
	xfs_daddr_t i;
	xfs_buf_t *bp;
	char *buf = NULL;
	xlog_rec_header_t *head = NULL;
	int error = 0;
	int smallmem = 0;	/* nonzero => one-block buffer, re-read per block */
	int num_blks = *last_blk - start_blk;

	ASSERT(start_blk != 0 || *last_blk != start_blk);

	if (!(bp = xlog_get_bp(num_blks, log->l_mp))) {
		/* can't fit the whole range; fall back to block-at-a-time */
		if (!(bp = xlog_get_bp(1, log->l_mp)))
			return -ENOMEM;
		smallmem = 1;
		buf = XFS_BUF_PTR(bp);
	} else {
		/* read everything once, then scan the buffer back to front */
		if ((error = xlog_bread(log, start_blk, num_blks, bp)))
			goto out;
		buf = XFS_BUF_PTR(bp) + (num_blks - 1) * BBSIZE;
	}


	/* scan backwards from the block before *last_blk for a record header */
	for (i=(*last_blk)-1; i>=0; i--) {
		if (i < start_blk) {
			/* legal log record not found */
			xlog_warn("XFS: Log inconsistent (didn't find previous header)");
#ifdef __KERNEL__
			ASSERT(0);
#endif
			error = XFS_ERROR(EIO);
			goto out;
		}

		if (smallmem && (error = xlog_bread(log, i, 1, bp)))
			goto out;
		head = (xlog_rec_header_t*)buf;

		if (INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
			break;

		/* in the big-buffer case just step back within the buffer */
		if (!smallmem)
			buf -= BBSIZE;
	}

	/*
	 * We hit the beginning of the physical log & still no header.  Return
	 * to caller.  If caller can handle a return of -1, then this routine
	 * will be called again for the end of the physical log.
	 */
	if (i == -1) {
		error = -1;
		goto out;
	}

	/* we have the final block of the good log (the first block
	 * of the log record _before_ the head.  So we check the uuid.
	 */

	if ((error = xlog_header_check_mount(log->l_mp, head)))
		goto out;

	/*
	 * We may have found a log record header before we expected one.
	 * last_blk will be the 1st block # with a given cycle #.  We may end
	 * up reading an entire log record.  In this case, we don't want to
	 * reset last_blk.  Only when last_blk points in the middle of a log
	 * record do we update last_blk.
	 */
	if (*last_blk - i + extra_bblks
			!= BTOBB(INT_GET(head->h_len, ARCH_CONVERT))+1)
		*last_blk = i;

out:
	xlog_put_bp(bp);

	return error;
}	/* xlog_find_verify_log_record */
228
/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LR have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Also called from xfs_log_print.c
 *
 * Return: zero if normal, non-zero if error.  On success *return_head_blk
 * holds the block number where the next log write would go.
 */
int
xlog_find_head(xlog_t *log,
	       xfs_daddr_t *return_head_blk)
{
	xfs_buf_t *bp;
	xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
	int num_scan_bblks;
	uint first_half_cycle, last_half_cycle;
	uint stop_on_cycle;
	int error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
		*return_head_blk = first_blk;

		/* is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xlog_warn("XFS: totally zeroed log\n");
		}

		return 0;
	} else if (error) {
		xlog_warn("XFS: empty log check failed");
		return error;
	}

	first_blk = 0;				/* get cycle # of 1st block */
	bp = xlog_get_bp(1,log->l_mp);
	if (!bp)
		return -ENOMEM;
	if ((error = xlog_bread(log, 0, 1, bp)))
		goto bp_err;
	first_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);

	last_blk = head_blk = log_bbnum-1;	/* get cycle # of last block */
	if ((error = xlog_bread(log, last_blk, 1, bp)))
		goto bp_err;
	last_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an illegal block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have cycle
		 * number last_half_cycle.  We need to scan backwards from the
		 * end verifying that there are no holes still containing
		 * last_half_cycle - 1.  If we find such a hole, then the start
		 * of that hole will be the new head.  The simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and the
		 * head has to end up at the start of the x-1 hole at the end of
		 * the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.  We
		 * don't worry about the x+1 blocks that we encounter, because
		 * we know that they cannot be the head since the log started with
		 * x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle number
		 * matching last_half_cycle.  We expect the log to be some
		 * variation on
		 *        x + 1 ... | x ...
		 * The first block with cycle number x (last_half_cycle) will be
		 * where the new head belongs.  First we do a binary search for
		 * the first occurrence of last_half_cycle.  The binary search
		 * may not be totally accurate, so then we scan back from there
		 * looking for occurrences of last_half_cycle before us.  If
		 * that backwards scan wraps around the beginning of the log,
		 * then we look for occurrences of last_half_cycle - 1 at the
		 * end of the log.  The cases we're looking for look like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                               ^ binary search stopped here
		 * or
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 */
		stop_on_cycle = last_half_cycle;
		if ((error = xlog_find_cycle_start(log, bp, first_blk,
						   &head_blk, last_half_cycle)))
			goto bp_err;
	}

	/*
	 * Now validate the answer.  Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number.  The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log.  The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = BTOBB(XLOG_MAX_ICLOGS<<XLOG_MAX_RECORD_BSHIFT);
	if (head_blk >= num_scan_bblks) {
		/*
		 * We are guaranteed that the entire check can be performed
		 * in one buffer.
		 */
		start_blk = head_blk - num_scan_bblks;
		new_blk = xlog_find_verify_cycle(log, start_blk, num_scan_bblks,
						 stop_on_cycle);
		if (new_blk != -1)
			head_blk = new_blk;
	} else {			/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.  First
		 * we scan the physical end of the log.  In this part of the log,
		 * we are looking for blocks with cycle number last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as we've
		 * found a hole that didn't get written in going around the end
		 * of the physical log.  The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of the
		 * log looking for occurrences of last_half_cycle.  If we find one,
		 * then our current estimate for the location of the first
		 * occurrence of last_half_cycle is wrong and we move back to the
		 * hole we've found.  This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                               ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k logs is
		 *        x + 1 ... | x ... | x+1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the x+1
		 * blocks.  We need to skip past those since that is certainly not
		 * the head of the log.  By searching for last_half_cycle-1 we
		 * accomplish that.
		 */
		start_blk = log_bbnum - num_scan_bblks + head_blk;
		ASSERT(head_blk <= INT_MAX && (xfs_daddr_t) num_scan_bblks-head_blk >= 0);
		new_blk= xlog_find_verify_cycle(log, start_blk,
			  num_scan_bblks-(int)head_blk, (stop_on_cycle - 1));
		if (new_blk != -1) {
			head_blk = new_blk;
			goto bad_blk;
		}

		/*
		 * Scan beginning of log now.  The last part of the physical log
		 * is good.  This scan needs to verify that it doesn't find the
		 * last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		new_blk = xlog_find_verify_cycle(log, start_blk, (int) head_blk,
						 stop_on_cycle);
		if (new_blk != -1)
			head_blk = new_blk;
	}

bad_blk:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = BTOBB(XLOG_MAX_RECORD_BSIZE);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		if ((error = xlog_find_verify_log_record(log,
							 start_blk,
							 &head_blk,
							 0)) == -1) {
			error = XFS_ERROR(EIO);
			goto bp_err;
		} else if (error)
			goto bp_err;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_log_record(log,
							 start_blk,
							 &head_blk,
							 0)) == -1) {
			/* We hit the beginning of the log during our search */
			start_blk = log_bbnum - num_scan_bblks + head_blk;
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			if ((error = xlog_find_verify_log_record(log,
								 start_blk,
								 &new_blk,
								 (int)head_blk)) == -1) {
				error = XFS_ERROR(EIO);
				goto bp_err;
			} else if (error)
				goto bp_err;
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto bp_err;
	}

	xlog_put_bp(bp);
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number.  Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1.  In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

bp_err:
	xlog_put_bp(bp);

	if (error)
		xlog_warn("XFS: failed to find log head");

	return error;
}	/* xlog_find_head */
482
483 /*
484 * Find the sync block number or the tail of the log.
485 *
486 * This will be the block number of the last record to have its
487 * associated buffers synced to disk. Every log record header has
488 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
489 * to get a sync block number. The only concern is to figure out which
490 * log record header to believe.
491 *
492 * The following algorithm uses the log record header with the largest
493 * lsn. The entire log record does not need to be valid. We only care
494 * that the header is valid.
495 *
496 * We could speed up search by using current head_blk buffer, but it is not
497 * available.
498 */
499 int
500 xlog_find_tail(xlog_t *log,
501 xfs_daddr_t *head_blk,
502 xfs_daddr_t *tail_blk,
503 int readonly)
504 {
505 xlog_rec_header_t *rhead;
506 xlog_op_header_t *op_head;
507 xfs_buf_t *bp;
508 int error, i, found;
509 xfs_daddr_t umount_data_blk;
510 xfs_daddr_t after_umount_blk;
511 xfs_lsn_t tail_lsn;
512
513 found = error = 0;
514
515 /*
516 * Find previous log record
517 */
518 if ((error = xlog_find_head(log, head_blk)))
519 return error;
520
521 bp = xlog_get_bp(1,log->l_mp);
522 if (!bp)
523 return -ENOMEM;
524 if (*head_blk == 0) { /* special case */
525 if ((error = xlog_bread(log, 0, 1, bp)))
526 goto bread_err;
527 if (GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT) == 0) {
528 *tail_blk = 0;
529 /* leave all other log inited values alone */
530 goto exit;
531 }
532 }
533
534 /*
535 * Search backwards looking for log record header block
536 */
537 ASSERT(*head_blk < INT_MAX);
538 for (i=(int)(*head_blk)-1; i>=0; i--) {
539 if ((error = xlog_bread(log, i, 1, bp)))
540 goto bread_err;
541 if (INT_GET(*(uint *)(XFS_BUF_PTR(bp)), ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) {
542 found = 1;
543 break;
544 }
545 }
546 /*
547 * If we haven't found the log record header block, start looking
548 * again from the end of the physical log. XXXmiken: There should be
549 * a check here to make sure we didn't search more than N blocks in
550 * the previous code.
551 */
552 if (!found) {
553 for (i=log->l_logBBsize-1; i>=(int)(*head_blk); i--) {
554 if ((error = xlog_bread(log, i, 1, bp)))
555 goto bread_err;
556 if (INT_GET(*(uint*)(XFS_BUF_PTR(bp)), ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) {
557 found = 2;
558 break;
559 }
560 }
561 }
562 if (!found) {
563 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
564 ASSERT(0);
565 return XFS_ERROR(EIO);
566 }
567
568 /* find blk_no of tail of log */
569 rhead = (xlog_rec_header_t *)XFS_BUF_PTR(bp);
570 *tail_blk = BLOCK_LSN(rhead->h_tail_lsn, ARCH_CONVERT);
571
572 /*
573 * Reset log values according to the state of the log when we
574 * crashed. In the case where head_blk == 0, we bump curr_cycle
575 * one because the next write starts a new cycle rather than
576 * continuing the cycle of the last good log record. At this
577 * point we have guaranteed that all partial log records have been
578 * accounted for. Therefore, we know that the last good log record
579 * written was complete and ended exactly on the end boundary
580 * of the physical log.
581 */
582 log->l_prev_block = i;
583 log->l_curr_block = (int)*head_blk;
584 log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
585 if (found == 2)
586 log->l_curr_cycle++;
587 log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
588 log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
589 log->l_grant_reserve_cycle = log->l_curr_cycle;
590 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
591 log->l_grant_write_cycle = log->l_curr_cycle;
592 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
593
594 /*
595 * Look for unmount record. If we find it, then we know there
596 * was a clean unmount. Since 'i' could be the last block in
597 * the physical log, we convert to a log block before comparing
598 * to the head_blk.
599 *
600 * Save the current tail lsn to use to pass to
601 * xlog_clear_stale_blocks() below. We won't want to clear the
602 * unmount record if there is one, so we pass the lsn of the
603 * unmount record rather than the block after it.
604 */
605 after_umount_blk = (i + 2) % log->l_logBBsize;
606 tail_lsn = log->l_tail_lsn;
607 if (*head_blk == after_umount_blk && INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
608 umount_data_blk = (i + 1) % log->l_logBBsize;
609 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
610 goto bread_err;
611 }
612 op_head = (xlog_op_header_t *)XFS_BUF_PTR(bp);
613 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
614 /*
615 * Set tail and last sync so that newly written
616 * log records will point recovery to after the
617 * current unmount record.
618 */
619 ASSIGN_ANY_LSN(log->l_tail_lsn, log->l_curr_cycle,
620 after_umount_blk, ARCH_NOCONVERT);
621 ASSIGN_ANY_LSN(log->l_last_sync_lsn, log->l_curr_cycle,
622 after_umount_blk, ARCH_NOCONVERT);
623 *tail_blk = after_umount_blk;
624 }
625 }
626
627 #ifdef __KERNEL__
628 /*
629 * Make sure that there are no blocks in front of the head
630 * with the same cycle number as the head. This can happen
631 * because we allow multiple outstanding log writes concurrently,
632 * and the later writes might make it out before earlier ones.
633 *
634 * We use the lsn from before modifying it so that we'll never
635 * overwrite the unmount record after a clean unmount.
636 *
637 * Do this only if we are going to recover the filesystem
638 */
639 if (!readonly)
640 error = xlog_clear_stale_blocks(log, tail_lsn);
641 #endif
642
643 bread_err:
644 exit:
645 xlog_put_bp(bp);
646
647 if (error)
648 xlog_warn("XFS: failed to locate log tail");
649
650 return error;
651 } /* xlog_find_tail */
652
653
654 /*
655 * Is the log zeroed at all?
656 *
657 * The last binary search should be changed to perform an X block read
658 * once X becomes small enough. You can then search linearly through
659 * the X blocks. This will cut down on the number of reads we need to do.
660 *
661 * If the log is partially zeroed, this routine will pass back the blkno
662 * of the first block with cycle number 0. It won't have a complete LR
663 * preceding it.
664 *
665 * Return:
666 * 0 => the log is completely written to
667 * -1 => use *blk_no as the first block of the log
668 * >0 => error has occurred
669 */
670 int
671 xlog_find_zeroed(struct log *log,
672 xfs_daddr_t *blk_no)
673 {
674 xfs_buf_t *bp;
675 uint first_cycle, last_cycle;
676 xfs_daddr_t new_blk, last_blk, start_blk;
677 xfs_daddr_t num_scan_bblks;
678 int error, log_bbnum = log->l_logBBsize;
679
680 error = 0;
681 /* check totally zeroed log */
682 bp = xlog_get_bp(1,log->l_mp);
683 if (!bp)
684 return -ENOMEM;
685 if ((error = xlog_bread(log, 0, 1, bp)))
686 goto bp_err;
687 first_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
688 if (first_cycle == 0) { /* completely zeroed log */
689 *blk_no = 0;
690 xlog_put_bp(bp);
691 return -1;
692 }
693
694 /* check partially zeroed log */
695 if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
696 goto bp_err;
697 last_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
698 if (last_cycle != 0) { /* log completely written to */
699 xlog_put_bp(bp);
700 return 0;
701 } else if (first_cycle != 1) {
702 /*
703 * If the cycle of the last block is zero, the cycle of
704 * the first block must be 1. If it's not, maybe we're
705 * not looking at a log... Bail out.
706 */
707 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
708 return XFS_ERROR(EINVAL);
709 }
710
711 /* we have a partially zeroed log */
712 last_blk = log_bbnum-1;
713 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
714 goto bp_err;
715
716 /*
717 * Validate the answer. Because there is no way to guarantee that
718 * the entire log is made up of log records which are the same size,
719 * we scan over the defined maximum blocks. At this point, the maximum
720 * is not chosen to mean anything special. XXXmiken
721 */
722 num_scan_bblks = BTOBB(XLOG_MAX_ICLOGS<<XLOG_MAX_RECORD_BSHIFT);
723 ASSERT(num_scan_bblks <= INT_MAX);
724
725 if (last_blk < num_scan_bblks)
726 num_scan_bblks = last_blk;
727 start_blk = last_blk - num_scan_bblks;
728
729 /*
730 * We search for any instances of cycle number 0 that occur before
731 * our current estimate of the head. What we're trying to detect is
732 * 1 ... | 0 | 1 | 0...
733 * ^ binary search ends here
734 */
735 new_blk = xlog_find_verify_cycle(log, start_blk,
736 (int)num_scan_bblks, 0);
737 if (new_blk != -1)
738 last_blk = new_blk;
739
740 /*
741 * Potentially backup over partial log record write. We don't need
742 * to search the end of the log because we know it is zero.
743 */
744 if ((error = xlog_find_verify_log_record(log, start_blk,
745 &last_blk, 0)))
746 goto bp_err;
747
748 *blk_no = last_blk;
749 bp_err:
750 xlog_put_bp(bp);
751 if (error)
752 return error;
753 return -1;
754 } /* xlog_find_zeroed */
755
756 /* stuff for transactional view */
/*
 * Restore the first 32-bit word of each BBSIZE block of record data from
 * rhead->h_cycle_data (the words were saved there when the record was
 * written — presumably so every block could be stamped with the cycle
 * number; confirm against the log-write path).
 */
STATIC void
xlog_unpack_data(xlog_rec_header_t *rhead,
		 xfs_caddr_t dp,
		 xlog_t *log)
{
	int i;
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
	uint *up = (uint *)dp;
	uint chksum = 0;
#endif

	/* put the saved word back at the start of each block */
	for (i=0; i<BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
		INT_SET(*(uint *)dp, ARCH_CONVERT, INT_GET(*(uint *)&rhead->h_cycle_data[i], ARCH_CONVERT));
		dp += BBSIZE;
	}
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
	/* divide length by 4 to get # words */
	for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
		chksum ^= INT_GET(*up, ARCH_CONVERT);
		up++;
	}
	if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
		if (!INT_ISZERO(rhead->h_chksum, ARCH_CONVERT) ||
		    ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
			cmn_err(CE_DEBUG,
			"XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
				INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
			cmn_err(CE_DEBUG,
"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
			/* warn only once per log */
			log->l_flags |= XLOG_CHKSUM_MISMATCH;
		}
	}
#endif /* DEBUG && XFS_LOUD_RECOVERY */
}	/* xlog_unpack_data */
791
792
793 STATIC xlog_recover_t *
794 xlog_recover_find_tid(xlog_recover_t *q,
795 xlog_tid_t tid)
796 {
797 xlog_recover_t *p = q;
798
799 while (p != NULL) {
800 if (p->r_log_tid == tid)
801 break;
802 p = p->r_next;
803 }
804 return p;
805 } /* xlog_recover_find_tid */
806
/*
 * Push trans onto the front of the singly-linked hash chain *q.
 */
STATIC void
xlog_recover_put_hashq(xlog_recover_t **q,
		       xlog_recover_t *trans)
{
	trans->r_next = *q;
	*q = trans;
}	/* xlog_recover_put_hashq */
814
815 STATIC void
816 xlog_recover_new_tid(xlog_recover_t **q,
817 xlog_tid_t tid,
818 xfs_lsn_t lsn)
819 {
820 xlog_recover_t *trans;
821
822 trans = kmem_zalloc(sizeof(xlog_recover_t), 0);
823 trans->r_log_tid = tid;
824 trans->r_lsn = lsn;
825 xlog_recover_put_hashq(q, trans);
826 } /* xlog_recover_new_tid */
827
828
829 STATIC int
830 xlog_recover_unlink_tid(xlog_recover_t **q,
831 xlog_recover_t *trans)
832 {
833 xlog_recover_t *tp;
834 int found = 0;
835
836 ASSERT(trans != 0);
837 if (trans == *q) {
838 *q = (*q)->r_next;
839 } else {
840 tp = *q;
841 while (tp != 0) {
842 if (tp->r_next == trans) {
843 found = 1;
844 break;
845 }
846 tp = tp->r_next;
847 }
848 if (!found) {
849 xlog_warn(
850 "XFS: xlog_recover_unlink_tid: trans not found");
851 ASSERT(0);
852 return XFS_ERROR(EIO);
853 }
854 tp->r_next = tp->r_next->r_next;
855 }
856 return 0;
857 } /* xlog_recover_unlink_tid */
858
/*
 * Free up any resources allocated by the transaction
 *
 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
 */
STATIC void
xlog_recover_free_trans(xlog_recover_t *trans)
{
	xlog_recover_item_t *first_item, *item, *free_item;
	int i;

	/* walk the circular item list exactly once */
	item = first_item = trans->r_itemq;
	do {
		free_item = item;
		item = item->ri_next;	/* advance before freeing the node */
		/* Free the regions in the item. */
		for (i = 0; i < free_item->ri_cnt; i++) {
			kmem_free(free_item->ri_buf[i].i_addr,
				  free_item->ri_buf[i].i_len);
		}
		/* Free the item itself */
		kmem_free(free_item->ri_buf,
			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
		kmem_free(free_item, sizeof(xlog_recover_item_t));
	} while (first_item != item);
	/* Free the transaction recover structure */
	kmem_free(trans, sizeof(xlog_recover_t));
}	/* xlog_recover_free_trans */
887
888
889 STATIC int
890 xlog_recover_commit_trans(xlog_t *log,
891 xlog_recover_t **q,
892 xlog_recover_t *trans,
893 int pass)
894 {
895 int error;
896
897 if ((error = xlog_recover_unlink_tid(q, trans)))
898 return error;
899 if ((error = xlog_recover_do_trans(log, trans, pass)))
900 return error;
901 xlog_recover_free_trans(trans); /* no error */
902 return 0;
903 } /* xlog_recover_commit_trans */
904
905 STATIC void
906 xlog_recover_insert_item_backq(xlog_recover_item_t **q,
907 xlog_recover_item_t *item)
908 {
909 if (*q == 0) {
910 item->ri_prev = item->ri_next = item;
911 *q = item;
912 } else {
913 item->ri_next = *q;
914 item->ri_prev = (*q)->ri_prev;
915 (*q)->ri_prev = item;
916 item->ri_prev->ri_next = item;
917 }
918 } /* xlog_recover_insert_item_backq */
919
920 STATIC void
921 xlog_recover_add_item(xlog_recover_item_t **itemq)
922 {
923 xlog_recover_item_t *item;
924
925 item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
926 xlog_recover_insert_item_backq(itemq, item);
927 } /* xlog_recover_add_item */
928
929 /* The next region to add is the start of a new region. It could be
930 * a whole region or it could be the first part of a new region. Because
931 * of this, the assumption here is that the type and size fields of all
932 * format structures fit into the first 32 bits of the structure.
933 *
934 * This works because all regions must be 32 bit aligned. Therefore, we
935 * either have both fields or we have neither field. In the case we have
936 * neither field, the data part of the region is zero length. We only have
937 * a log_op_header and can throw away the header since a new one will appear
938 * later. If we have at least 4 bytes, then we can determine how many regions
939 * will appear in the current log item.
940 */
941 STATIC int
942 xlog_recover_add_to_trans(xlog_recover_t *trans,
943 xfs_caddr_t dp,
944 int len)
945 {
946 xfs_inode_log_format_t *in_f; /* any will do */
947 xlog_recover_item_t *item;
948 xfs_caddr_t ptr;
949
950 if (!len)
951 return 0;
952 ptr = kmem_zalloc(len, 0);
953 bcopy(dp, ptr, len);
954
955 in_f = (xfs_inode_log_format_t *)ptr;
956 item = trans->r_itemq;
957 if (item == 0) {
958 ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
959 if (len == sizeof(xfs_trans_header_t))
960 xlog_recover_add_item(&trans->r_itemq);
961 bcopy(dp, &trans->r_theader, len); /* s, d, l */
962 return 0;
963 }
964 if (item->ri_prev->ri_total != 0 &&
965 item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
966 xlog_recover_add_item(&trans->r_itemq);
967 }
968 item = trans->r_itemq;
969 item = item->ri_prev;
970
971 if (item->ri_total == 0) { /* first region to be added */
972 item->ri_total = in_f->ilf_size;
973 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
974 item->ri_buf = kmem_zalloc((item->ri_total *
975 sizeof(xfs_log_iovec_t)), 0);
976 }
977 ASSERT(item->ri_total > item->ri_cnt);
978 /* Description region is ri_buf[0] */
979 item->ri_buf[item->ri_cnt].i_addr = ptr;
980 item->ri_buf[item->ri_cnt].i_len = len;
981 item->ri_cnt++;
982 return 0;
983 } /* xlog_recover_add_to_trans */
984
/*
 * Append a continuation region to the last region of the transaction's
 * last item: grow the existing buffer with kmem_realloc() and copy the
 * new bytes onto its end.  If the transaction has no items yet, the
 * bytes complete the partially-received transaction header instead.
 */
STATIC int
xlog_recover_add_to_cont_trans(xlog_recover_t *trans,
			       xfs_caddr_t dp,
			       int len)
{
	xlog_recover_item_t *item;
	xfs_caddr_t ptr, old_ptr;
	int old_len;

	item = trans->r_itemq;
	if (item == 0) {
		/* finish copying rest of trans header */
		xlog_recover_add_item(&trans->r_itemq);
		/* destination: the tail end of r_theader */
		ptr = (xfs_caddr_t)&trans->r_theader+sizeof(xfs_trans_header_t)-len;
		bcopy(dp, ptr, len);			/* s, d, l */
		return 0;
	}
	item = item->ri_prev;	/* last item in the circular queue */

	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
	old_len = item->ri_buf[item->ri_cnt-1].i_len;

	/* grow the region buffer and append the continuation bytes */
	ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
	bcopy(dp , &ptr[old_len], len);			/* s, d, l */
	item->ri_buf[item->ri_cnt-1].i_len += len;
	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
	return 0;
}	/* xlog_recover_add_to_cont_trans */
1013
1014 STATIC int
1015 xlog_recover_unmount_trans(xlog_recover_t *trans)
1016 {
1017 /* Do nothing now */
1018 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
1019 return( 0 );
1020 } /* xlog_recover_unmount_trans */
1021
1022
/*
 * Walk all log operations in one log record, dispatching each op to the
 * appropriate handler based on its transaction-state flags.  Transactions
 * in progress are tracked in the rhash[] table keyed by transaction id.
 * `pass` is forwarded unexamined to the commit handler.
 *
 * Returns 0 on success, or an XFS error code (EIO on a malformed record).
 */
STATIC int
xlog_recover_process_data(xlog_t	    *log,
			  xlog_recover_t    *rhash[],
			  xlog_rec_header_t *rhead,
			  xfs_caddr_t	    dp,
			  int		    pass)
{
	xfs_caddr_t	    lp	    = dp+INT_GET(rhead->h_len, ARCH_CONVERT);
	int		    num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
	xlog_op_header_t    *ohead;
	xlog_recover_t	    *trans;
	xlog_tid_t	    tid;
	int		    error;
	unsigned long	    hash;
	uint		    flags;

	/* check the log format matches our own - else we can't recover */
	if (xlog_header_check_recover(log->l_mp, rhead))
		return (XFS_ERROR(EIO));

	/*
	 * Loop bound is byte-based (dp < lp); num_logops is decremented
	 * but never checked against it.  NOTE(review): a record whose
	 * h_num_logops disagrees with h_len is not detected here.
	 */
	while (dp < lp) {
		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
		ohead = (xlog_op_header_t *)dp;
		dp += sizeof(xlog_op_header_t);
		if (ohead->oh_clientid != XFS_TRANSACTION &&
		    ohead->oh_clientid != XFS_LOG) {
			xlog_warn("XFS: xlog_recover_process_data: bad clientid");
			ASSERT(0);
			return (XFS_ERROR(EIO));
		}
		tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
		hash = XLOG_RHASH(tid);
		trans = xlog_recover_find_tid(rhash[hash], tid);
		if (trans == NULL) {		   /* not found; add new tid */
			/* Ops for unknown tids without START are skipped. */
			if (ohead->oh_flags & XLOG_START_TRANS)
				xlog_recover_new_tid(&rhash[hash], tid, INT_GET(rhead->h_lsn, ARCH_CONVERT));
		} else {
			ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
			/*
			 * END is implied by COMMIT; a continued op that was
			 * continued again keeps only WAS_CONT so the switch
			 * below sees a single state.
			 */
			flags = ohead->oh_flags & ~XLOG_END_TRANS;
			if (flags & XLOG_WAS_CONT_TRANS)
				flags &= ~XLOG_CONTINUE_TRANS;
			switch (flags) {
			    case XLOG_COMMIT_TRANS: {
				    error = xlog_recover_commit_trans(log, &rhash[hash],
								      trans, pass);
				    break;
			    }
			    case XLOG_UNMOUNT_TRANS: {
				    error = xlog_recover_unmount_trans(trans);
				    break;
			    }
			    case XLOG_WAS_CONT_TRANS: {
				    error = xlog_recover_add_to_cont_trans(trans, dp,
						   INT_GET(ohead->oh_len, ARCH_CONVERT));
				    break;
			    }
			    case XLOG_START_TRANS : {
				    /* START for a tid we already track is corrupt */
				    xlog_warn("XFS: xlog_recover_process_data: bad transaction");
				    ASSERT(0);
				    error = XFS_ERROR(EIO);
				    break;
			    }
			    case 0:
			    case XLOG_CONTINUE_TRANS: {
				    error = xlog_recover_add_to_trans(trans, dp,
						   INT_GET(ohead->oh_len, ARCH_CONVERT));
				    break;
			    }
			    default: {
				    xlog_warn("XFS: xlog_recover_process_data: bad flag");
				    ASSERT(0);
				    error = XFS_ERROR(EIO);
				    break;
			    }
			} /* switch */
			if (error)
				return error;
		} /* if */
		dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
		num_logops--;
	}
	return( 0 );
}	/* xlog_recover_process_data */
1106
1107 /*
1108 * Read the log from tail to head and process the log records found.
1109 * Handle the two cases where the tail and head are in the same cycle
1110 * and where the active portion of the log wraps around the end of
1111 * the physical log separately. The pass parameter is passed through
1112 * to the routines called to process the data and is not looked at
1113 * here.
1114 */
1115 int
1116 xlog_do_recovery_pass(xlog_t *log,
1117 xfs_daddr_t head_blk,
1118 xfs_daddr_t tail_blk,
1119 int pass)
1120 {
1121 xlog_rec_header_t *rhead;
1122 xfs_daddr_t blk_no;
1123 xfs_caddr_t bufaddr;
1124 xfs_buf_t *hbp, *dbp;
1125 int error;
1126 int bblks, split_bblks;
1127 xlog_recover_t *rhash[XLOG_RHASH_SIZE];
1128
1129 error = 0;
1130 hbp = xlog_get_bp(1,log->l_mp);
1131 if (!hbp)
1132 return -ENOMEM;
1133 dbp = xlog_get_bp(BTOBB(XLOG_MAX_RECORD_BSIZE),log->l_mp);
1134 if (!dbp) {
1135 xlog_put_bp(hbp);
1136 return -ENOMEM;
1137 }
1138 bzero(rhash, sizeof(rhash));
1139 if (tail_blk <= head_blk) {
1140 for (blk_no = tail_blk; blk_no < head_blk; ) {
1141 if ((error = xlog_bread(log, blk_no, 1, hbp)))
1142 goto bread_err;
1143 rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
1144 ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
1145 ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) <= INT_MAX));
1146 bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); /* blocks in data section */
1147 if (bblks > 0) {
1148 if ((error = xlog_bread(log, blk_no+1, bblks, dbp)))
1149 goto bread_err;
1150 xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
1151 if ((error = xlog_recover_process_data(log, rhash,
1152 rhead, XFS_BUF_PTR(dbp),
1153 pass)))
1154 goto bread_err;
1155 }
1156 blk_no += (bblks+1);
1157 }
1158 } else {
1159 /*
1160 * Perform recovery around the end of the physical log. When the head
1161 * is not on the same cycle number as the tail, we can't do a sequential
1162 * recovery as above.
1163 */
1164 blk_no = tail_blk;
1165 while (blk_no < log->l_logBBsize) {
1166
1167 /* Read header of one block */
1168 if ((error = xlog_bread(log, blk_no, 1, hbp)))
1169 goto bread_err;
1170 rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
1171 ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
1172 ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) <= INT_MAX));
1173 bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
1174
1175 /* LR body must have data or it wouldn't have been written */
1176 ASSERT(bblks > 0);
1177 blk_no++; /* successfully read header */
1178 ASSERT(blk_no <= log->l_logBBsize);
1179
1180 if ((INT_GET(rhead->h_magicno, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) ||
1181 (BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) > INT_MAX)) ||
1182 (bblks <= 0) ||
1183 (blk_no > log->l_logBBsize)) {
1184 error = EFSCORRUPTED;
1185 goto bread_err;
1186 }
1187
1188 /* Read in data for log record */
1189 if (blk_no+bblks <= log->l_logBBsize) {
1190 if ((error = xlog_bread(log, blk_no, bblks, dbp)))
1191 goto bread_err;
1192 } else {
1193 /* This log record is split across physical end of log */
1194 split_bblks = 0;
1195 if (blk_no != log->l_logBBsize) {
1196
1197 /* some data is before physical end of log */
1198 ASSERT(blk_no <= INT_MAX);
1199 split_bblks = log->l_logBBsize - (int)blk_no;
1200 ASSERT(split_bblks > 0);
1201 if ((error = xlog_bread(log, blk_no, split_bblks, dbp)))
1202 goto bread_err;
1203 }
1204 bufaddr = XFS_BUF_PTR(dbp);
1205 XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks),
1206 BBTOB(bblks - split_bblks));
1207 if ((error = xlog_bread(log, 0, bblks - split_bblks, dbp)))
1208 goto bread_err;
1209 XFS_BUF_SET_PTR(dbp, bufaddr, XLOG_MAX_RECORD_BSIZE);
1210 }
1211 xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
1212 if ((error = xlog_recover_process_data(log, rhash,
1213 rhead, XFS_BUF_PTR(dbp),
1214 pass)))
1215 goto bread_err;
1216 blk_no += bblks;
1217 }
1218
1219 ASSERT(blk_no >= log->l_logBBsize);
1220 blk_no -= log->l_logBBsize;
1221
1222 /* read first part of physical log */
1223 while (blk_no < head_blk) {
1224 if ((error = xlog_bread(log, blk_no, 1, hbp)))
1225 goto bread_err;
1226 rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
1227 ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM);
1228 ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT) <= INT_MAX));
1229 bblks = (int) BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
1230 ASSERT(bblks > 0);
1231 if ((error = xlog_bread(log, blk_no+1, bblks, dbp)))
1232 goto bread_err;
1233 xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
1234 if ((error = xlog_recover_process_data(log, rhash,
1235 rhead, XFS_BUF_PTR(dbp),
1236 pass)))
1237 goto bread_err;
1238 blk_no += (bblks+1);
1239 }
1240 }
1241
1242 bread_err:
1243 xlog_put_bp(dbp);
1244 xlog_put_bp(hbp);
1245
1246 return error;
1247 }