/*
 * Copyright (c) 2000 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * Further, this software is distributed without any warranty that it is
 * free of the rightful claim of any third person regarding infringement
 * or the like.  Any license provided herein, whether implied or
 * otherwise, applies only to this software file.  Patent licenses, if
 * any, provided herein do not apply to combinations of this program with
 * other software, or any other product whatsoever.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write the Free Software Foundation, Inc., 59
 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
 *
 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
 * Mountain View, CA 94043, or:
 *
 * http://www.sgi.com
 *
 * For further information regarding this notice, see:
 *
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */

#include <libxlog.h>

/*
 * This routine finds (to an approximation) the first block in the physical
 * log which contains the given cycle.  It uses a binary search algorithm.
 * Note that the algorithm cannot be perfect because the disk will not
 * necessarily be perfect.
 */
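/*
 * Worked example (hypothetical numbers): a 7-block log stamped with
 * cycles 2 2 2 2 1 1 1, searched for cycle 1 with first_blk = 0 and
 * *last_blk = 6.  The midpoints probed are block 3 (cycle 2, so
 * first_blk moves to 3) and block 4 (cycle 1, so *last_blk moves to 4);
 * the bounds are then adjacent and *last_blk = 4 is the approximate
 * start of cycle 1.
 */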
int
xlog_find_cycle_start(xlog_t		*log,
		      xfs_buf_t		*bp,
		      xfs_daddr_t	first_blk,
		      xfs_daddr_t	*last_blk,
		      uint		cycle)
{
	xfs_daddr_t	mid_blk;
	uint		mid_cycle;
	int		error;

	mid_blk = BLK_AVG(first_blk, *last_blk);
	while (mid_blk != first_blk && mid_blk != *last_blk) {
		if ((error = xlog_bread(log, mid_blk, 1, bp)))
			return error;
		mid_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
		if (mid_cycle == cycle) {
			*last_blk = mid_blk;
			/* last_half_cycle == mid_cycle */
		} else {
			first_blk = mid_blk;
			/* first_half_cycle == mid_cycle */
		}
		mid_blk = BLK_AVG(first_blk, *last_blk);
	}
	ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
	       (mid_blk == *last_blk && mid_blk-1 == first_blk));

	return 0;
}	/* xlog_find_cycle_start */


/*
 * Check that the range of blocks does not contain the cycle number
 * given.  The scan needs to occur from front to back and the ptr into the
 * region must be updated since a later routine will need to perform another
 * test.  If the region is completely good, we end up returning the same
 * last block number.
 *
 * Set new_blk to -1 if the stop cycle is not found anywhere in the
 * range; -1 is an invalid block number since we don't ever expect logs
 * to get this large.
 */
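/*
 * E.g. (hypothetical): scanning blocks 100..107 stamped with cycles
 * 2 2 2 1 2 2 2 2 with stop_on_cycle_no == 1 sets *new_blk = 103, the
 * first block still carrying the old cycle; a fully rewritten range
 * would instead come back with *new_blk == -1.
 */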

STATIC int
xlog_find_verify_cycle(xlog_t		*log,
		       xfs_daddr_t	start_blk,
		       int		nbblks,
		       uint		stop_on_cycle_no,
		       xfs_daddr_t	*new_blk)
{
	xfs_daddr_t	i, j;
	uint		cycle;
	xfs_buf_t	*bp;
	char		*buf = NULL;
	int		error = 0;
	xfs_daddr_t	bufblks = nbblks;

	while (!(bp = xlog_get_bp(bufblks, log->l_mp))) {
		/* can't get enough memory to do everything in one big buffer */
		bufblks >>= 1;
		if (!bufblks)
			return ENOMEM;
	}

	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
		int	bcount = min(bufblks, (start_blk + nbblks - i));

		if ((error = xlog_bread(log, i, bcount, bp)))
			goto out;

		buf = XFS_BUF_PTR(bp);
		for (j = 0; j < bcount; j++) {
			cycle = GET_CYCLE(buf, ARCH_CONVERT);
			if (cycle == stop_on_cycle_no) {
				*new_blk = i+j;
				goto out;
			}

			buf += BBSIZE;
		}
	}

	*new_blk = -1;

out:
	xlog_put_bp(bp);

	return error;
}	/* xlog_find_verify_cycle */


/*
 * Potentially back up over a partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record.  Therefore, we subtract one to get the block number
 * of the last block in the given buffer.  extra_bblks contains the number
 * of blocks we would have read on a previous read.  This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 */
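/*
 * Sketch of the scan (hypothetical layout): if *last_blk == 250 and the
 * nearest record header below it sits at block 245, describing a record
 * whose h_len spans 8 blocks, then 250 - 245 + extra_bblks != 8 + 1 for
 * extra_bblks == 0, so block 250 points into the middle of that record
 * and *last_blk is pulled back to 245.
 */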

STATIC int
xlog_find_verify_log_record(xlog_t	*log,
			    xfs_daddr_t	start_blk,
			    xfs_daddr_t	*last_blk,
			    int		extra_bblks)
{
	xfs_daddr_t	i;
	xfs_buf_t	*bp;
	char		*buf = NULL;
	xlog_rec_header_t *head = NULL;
	int		error = 0;
	int		smallmem = 0;
	int		num_blks = *last_blk - start_blk;

	ASSERT(start_blk != 0 || *last_blk != start_blk);

	if (!(bp = xlog_get_bp(num_blks, log->l_mp))) {
		if (!(bp = xlog_get_bp(1, log->l_mp)))
			return ENOMEM;
		smallmem = 1;
		buf = XFS_BUF_PTR(bp);
	} else {
		if ((error = xlog_bread(log, start_blk, num_blks, bp)))
			goto out;
		buf = XFS_BUF_PTR(bp) + (num_blks - 1) * BBSIZE;
	}

	for (i = (*last_blk) - 1; i >= 0; i--) {
		if (i < start_blk) {
			/* legal log record not found */
			xlog_warn("XFS: Log inconsistent (didn't find previous header)");
			ASSERT(0);
			error = XFS_ERROR(EIO);
			goto out;
		}

		if (smallmem && (error = xlog_bread(log, i, 1, bp)))
			goto out;
		head = (xlog_rec_header_t *)buf;

		if (INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM)
			break;

		if (!smallmem)
			buf -= BBSIZE;
	}

	/*
	 * We hit the beginning of the physical log & still no header.  Return
	 * to caller.  If caller can handle a return of -1, then this routine
	 * will be called again for the end of the physical log.
	 */
	if (i == -1) {
		error = -1;
		goto out;
	}

	/*
	 * We have the final block of the good log (the first block of the
	 * log record _before_ the head), so we check the uuid.
	 */
	if ((error = xlog_header_check_mount(log->l_mp, head)))
		goto out;

	/*
	 * We may have found a log record header before we expected one.
	 * last_blk will be the 1st block # with a given cycle #.  We may end
	 * up reading an entire log record.  In this case, we don't want to
	 * reset last_blk.  Only when last_blk points in the middle of a log
	 * record do we update last_blk.
	 */
	if (*last_blk - i + extra_bblks
	    != BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + 1)
		*last_blk = i;

out:
	xlog_put_bp(bp);

	return error;
}	/* xlog_find_verify_log_record */

/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LRs have complete transactions.  We only know that a cycle number of
 * current cycle number - 1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Also called from xfs_log_print.c
 *
 * Return: zero if normal, non-zero if error.
 */
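/*
 * A small worked example (hypothetical): a 100-block log whose blocks
 * carry cycle 4 in blocks 0..59 and cycle 3 in blocks 60..99 has
 * first_half_cycle == 4 and last_half_cycle == 3, so the binary search
 * below lands at block 60, the first block of cycle 3, and that becomes
 * the head candidate to be verified.
 */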
int
xlog_find_head(xlog_t		*log,
	       xfs_daddr_t	*return_head_blk)
{
	xfs_buf_t	*bp;
	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
	int		num_scan_bblks;
	uint		first_half_cycle, last_half_cycle;
	uint		stop_on_cycle;
	int		error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
		*return_head_blk = first_blk;

		/* is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xlog_warn("XFS: totally zeroed log\n");
		}

		return 0;
	} else if (error) {
		xlog_warn("XFS: empty log check failed");
		return error;
	}

	first_blk = 0;				/* get cycle # of 1st block */
	bp = xlog_get_bp(1, log->l_mp);
	if (!bp)
		return ENOMEM;
	if ((error = xlog_bread(log, 0, 1, bp)))
		goto bp_err;
	first_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	if ((error = xlog_bread(log, last_blk, 1, bp)))
		goto bp_err;
	last_half_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an illegal block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have cycle
		 * number last_half_cycle.  We need to scan backwards from the
		 * end verifying that there are no holes still containing
		 * last_half_cycle - 1.  If we find such a hole, then the start
		 * of that hole will be the new head.  The simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and the
		 * head has to end up at the start of the x-1 hole at the end of
		 * the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.  We
		 * don't worry about the x+1 blocks that we encounter, because
		 * we know that they cannot be the head since the log started with
		 * x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle number
		 * matching last_half_cycle.  We expect the log to be some
		 * variation on
		 *        x + 1 ... | x ...
		 * The first block with cycle number x (last_half_cycle) will be
		 * where the new head belongs.  First we do a binary search for
		 * the first occurrence of last_half_cycle.  The binary search
		 * may not be totally accurate, so then we scan back from there
		 * looking for occurrences of last_half_cycle before us.  If
		 * that backwards scan wraps around the beginning of the log,
		 * then we look for occurrences of last_half_cycle - 1 at the
		 * end of the log.  The cases we're looking for look like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                   ^ binary search stopped here
		 * or
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 */
		stop_on_cycle = last_half_cycle;
		if ((error = xlog_find_cycle_start(log, bp, first_blk,
						   &head_blk, last_half_cycle)))
			goto bp_err;
	}

	/*
	 * Now validate the answer.  Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number.  The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log.  The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = BTOBB(XLOG_MAX_ICLOGS << XLOG_MAX_RECORD_BSHIFT);
	if (head_blk >= num_scan_bblks) {
		/*
		 * We are guaranteed that the entire check can be performed
		 * in one buffer.
		 */
		start_blk = head_blk - num_scan_bblks;
		if ((error = xlog_find_verify_cycle(log, start_blk,
						    num_scan_bblks,
						    stop_on_cycle, &new_blk)))
			goto bp_err;
		if (new_blk != -1)
			head_blk = new_blk;
	} else {			/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.  First
		 * we scan the physical end of the log.  In this part of the log,
		 * we are looking for blocks with cycle number last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as we've
		 * found a hole that didn't get written in going around the end
		 * of the physical log.  The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of the
		 * log looking for occurrences of last_half_cycle.  If we find one,
		 * then our current estimate for the location of the first
		 * occurrence of last_half_cycle is wrong and we move back to the
		 * hole we've found.  This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                   ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k logs is
		 *        x + 1 ... | x ... | x + 1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the x + 1
		 * blocks.  We need to skip past those since that is certainly not
		 * the head of the log.  By searching for last_half_cycle - 1 we
		 * accomplish that.
		 */
		start_blk = log_bbnum - num_scan_bblks + head_blk;
		ASSERT(head_blk <= INT_MAX &&
		       (xfs_daddr_t)num_scan_bblks - head_blk >= 0);
		if ((error = xlog_find_verify_cycle(log, start_blk,
					num_scan_bblks - (int)head_blk,
					(stop_on_cycle - 1), &new_blk)))
			goto bp_err;
		if (new_blk != -1) {
			head_blk = new_blk;
			goto bad_blk;
		}

		/*
		 * Scan beginning of log now.  The last part of the physical log
		 * is good.  This scan needs to verify that it doesn't find the
		 * last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_cycle(log, start_blk,
						    (int)head_blk,
						    stop_on_cycle, &new_blk)))
			goto bp_err;
		if (new_blk != -1)
			head_blk = new_blk;
	}

bad_blk:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = BTOBB(XLOG_MAX_RECORD_BSIZE);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		if ((error = xlog_find_verify_log_record(log, start_blk,
							 &head_blk, 0)) == -1) {
			error = XFS_ERROR(EIO);
			goto bp_err;
		} else if (error)
			goto bp_err;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_log_record(log, start_blk,
							 &head_blk, 0)) == -1) {
			/* We hit the beginning of the log during our search */
			start_blk = log_bbnum - num_scan_bblks + head_blk;
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX &&
			       (xfs_daddr_t)log_bbnum - start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			if ((error = xlog_find_verify_log_record(log, start_blk,
							&new_blk,
							(int)head_blk)) == -1) {
				error = XFS_ERROR(EIO);
				goto bp_err;
			} else if (error)
				goto bp_err;
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto bp_err;
	}

	xlog_put_bp(bp);
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number.  Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1.  In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

bp_err:
	xlog_put_bp(bp);

	if (error)
		xlog_warn("XFS: failed to find log head");

	return error;
}	/* xlog_find_head */

/*
 * Find the sync block number or the tail of the log.
 *
 * This will be the block number of the last record to have its
 * associated buffers synced to disk.  Every log record header has
 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 * to get a sync block number.  The only concern is to figure out which
 * log record header to believe.
 *
 * The following algorithm uses the log record header with the largest
 * lsn.  The entire log record does not need to be valid.  We only care
 * that the header is valid.
 *
 * We could speed up the search by using the current head_blk buffer, but
 * it is not available.
 */
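/*
 * Rough picture of the encoding relied on here (assuming the usual
 * xfs_lsn_t layout of a cycle number in the high word and a block
 * number in the low word): a header whose h_tail_lsn decodes to, say,
 * cycle 5 / block 320 names block 320 as the tail candidate, which is
 * what BLOCK_LSN() extracts below.
 */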
int
xlog_find_tail(xlog_t		*log,
	       xfs_daddr_t	*head_blk,
	       xfs_daddr_t	*tail_blk,
	       int		readonly)
{
	xlog_rec_header_t	*rhead;
	xlog_op_header_t	*op_head;
	xfs_buf_t		*bp;
	int			error, i, found;
	xfs_daddr_t		umount_data_blk;
	xfs_daddr_t		after_umount_blk;
	xfs_lsn_t		tail_lsn;

	found = error = 0;

	/*
	 * Find previous log record
	 */
	if ((error = xlog_find_head(log, head_blk)))
		return error;

	bp = xlog_get_bp(1, log->l_mp);
	if (!bp)
		return ENOMEM;
	if (*head_blk == 0) {			/* special case */
		if ((error = xlog_bread(log, 0, 1, bp)))
			goto bread_err;
		if (GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT) == 0) {
			*tail_blk = 0;
			/* leave all other log inited values alone */
			goto exit;
		}
	}

	/*
	 * Search backwards looking for log record header block
	 */
	ASSERT(*head_blk < INT_MAX);
	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
		if ((error = xlog_bread(log, i, 1, bp)))
			goto bread_err;
		if (INT_GET(*(uint *)(XFS_BUF_PTR(bp)), ARCH_CONVERT) ==
		    XLOG_HEADER_MAGIC_NUM) {
			found = 1;
			break;
		}
	}
	/*
	 * If we haven't found the log record header block, start looking
	 * again from the end of the physical log.  XXXmiken: There should be
	 * a check here to make sure we didn't search more than N blocks in
	 * the previous code.
	 */
	if (!found) {
		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
			if ((error = xlog_bread(log, i, 1, bp)))
				goto bread_err;
			if (INT_GET(*(uint *)(XFS_BUF_PTR(bp)), ARCH_CONVERT) ==
			    XLOG_HEADER_MAGIC_NUM) {
				found = 2;
				break;
			}
		}
	}
	if (!found) {
		xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
		xlog_put_bp(bp);	/* don't leak the buffer on this error path */
		ASSERT(0);
		return XFS_ERROR(EIO);
	}

	/* find blk_no of tail of log */
	rhead = (xlog_rec_header_t *)XFS_BUF_PTR(bp);
	*tail_blk = BLOCK_LSN(rhead->h_tail_lsn, ARCH_CONVERT);

	/*
	 * Reset log values according to the state of the log when we
	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
	 * by one because the next write starts a new cycle rather than
	 * continuing the cycle of the last good log record.  At this
	 * point we have guaranteed that all partial log records have been
	 * accounted for.  Therefore, we know that the last good log record
	 * written was complete and ended exactly on the end boundary
	 * of the physical log.
	 */
	log->l_prev_block = i;
	log->l_curr_block = (int)*head_blk;
	log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT);
	if (found == 2)
		log->l_curr_cycle++;
	log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT);
	log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT);
	log->l_grant_reserve_cycle = log->l_curr_cycle;
	log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
	log->l_grant_write_cycle = log->l_curr_cycle;
	log->l_grant_write_bytes = BBTOB(log->l_curr_block);

	/*
	 * Look for unmount record.  If we find it, then we know there
	 * was a clean unmount.  Since 'i' could be the last block in
	 * the physical log, we convert to a log block before comparing
	 * to the head_blk.
	 *
	 * Save the current tail lsn to use to pass to
	 * xlog_clear_stale_blocks() below.  We won't want to clear the
	 * unmount record if there is one, so we pass the lsn of the
	 * unmount record rather than the block after it.
	 */
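	/*
	 * The modulo arithmetic handles the wrap case: e.g. if the header
	 * found above sits in the very last block, i == l_logBBsize - 1,
	 * then the unmount record data lands in block 0 and
	 * after_umount_blk comes out as 1.
	 */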
	after_umount_blk = (i + 2) % log->l_logBBsize;
	tail_lsn = log->l_tail_lsn;
	if (*head_blk == after_umount_blk &&
	    INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) {
		umount_data_blk = (i + 1) % log->l_logBBsize;
		if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
			goto bread_err;
		}
		op_head = (xlog_op_header_t *)XFS_BUF_PTR(bp);
		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
			/*
			 * Set tail and last sync so that newly written
			 * log records will point recovery to after the
			 * current unmount record.
			 */
			ASSIGN_ANY_LSN(log->l_tail_lsn, log->l_curr_cycle,
				       after_umount_blk, ARCH_NOCONVERT);
			ASSIGN_ANY_LSN(log->l_last_sync_lsn, log->l_curr_cycle,
				       after_umount_blk, ARCH_NOCONVERT);
			*tail_blk = after_umount_blk;
		}
	}

#ifdef __KERNEL__
	/*
	 * Make sure that there are no blocks in front of the head
	 * with the same cycle number as the head.  This can happen
	 * because we allow multiple outstanding log writes concurrently,
	 * and the later writes might make it out before earlier ones.
	 *
	 * We use the lsn from before modifying it so that we'll never
	 * overwrite the unmount record after a clean unmount.
	 *
	 * Do this only if we are going to recover the filesystem
	 */
	if (!readonly)
		error = xlog_clear_stale_blocks(log, tail_lsn);
#endif

bread_err:
exit:
	xlog_put_bp(bp);

	if (error)
		xlog_warn("XFS: failed to locate log tail");

	return error;
}	/* xlog_find_tail */

/*
 * Is the log zeroed at all?
 *
 * The last binary search should be changed to perform an X block read
 * once X becomes small enough.  You can then search linearly through
 * the X blocks.  This will cut down on the number of reads we need to do.
 *
 * If the log is partially zeroed, this routine will pass back the blkno
 * of the first block with cycle number 0.  It won't have a complete LR
 * preceding it.
 *
 * Return:
 *	0  => the log is completely written to
 *	-1 => use *blk_no as the first block of the log
 *	>0 => error has occurred
 */
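/*
 * E.g. (hypothetical): a freshly extended log whose blocks carry cycles
 * 1 1 1 1 0 0 0 0 is "partially zeroed"; the binary search plus the
 * verification scans below hand back *blk_no == 4 and the routine
 * returns -1, so the caller treats block 4 as the head.
 */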
int
xlog_find_zeroed(struct log	*log,
		 xfs_daddr_t	*blk_no)
{
	xfs_buf_t	*bp;
	uint		first_cycle, last_cycle;
	xfs_daddr_t	new_blk, last_blk, start_blk;
	xfs_daddr_t	num_scan_bblks;
	int		error, log_bbnum = log->l_logBBsize;

	error = 0;
	/* check totally zeroed log */
	bp = xlog_get_bp(1, log->l_mp);
	if (!bp)
		return ENOMEM;
	if ((error = xlog_bread(log, 0, 1, bp)))
		goto bp_err;
	first_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
	if (first_cycle == 0) {			/* completely zeroed log */
		*blk_no = 0;
		xlog_put_bp(bp);
		return -1;
	}

	/* check partially zeroed log */
	if ((error = xlog_bread(log, log_bbnum - 1, 1, bp)))
		goto bp_err;
	last_cycle = GET_CYCLE(XFS_BUF_PTR(bp), ARCH_CONVERT);
	if (last_cycle != 0) {			/* log completely written to */
		xlog_put_bp(bp);
		return 0;
	} else if (first_cycle != 1) {
		/*
		 * If the cycle of the last block is zero, the cycle of
		 * the first block must be 1.  If it's not, maybe we're
		 * not looking at a log... Bail out.
		 */
		xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
		xlog_put_bp(bp);	/* don't leak the buffer on this path */
		return XFS_ERROR(EINVAL);
	}

	/* we have a partially zeroed log */
	last_blk = log_bbnum - 1;
	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
		goto bp_err;

	/*
	 * Validate the answer.  Because there is no way to guarantee that
	 * the entire log is made up of log records which are the same size,
	 * we scan over the defined maximum blocks.  At this point, the maximum
	 * is not chosen to mean anything special.  XXXmiken
	 */
	num_scan_bblks = BTOBB(XLOG_MAX_ICLOGS << XLOG_MAX_RECORD_BSHIFT);
	ASSERT(num_scan_bblks <= INT_MAX);

	if (last_blk < num_scan_bblks)
		num_scan_bblks = last_blk;
	start_blk = last_blk - num_scan_bblks;

	/*
	 * We search for any instances of cycle number 0 that occur before
	 * our current estimate of the head.  What we're trying to detect is
	 *        1 ... | 0 | 1 | 0 ...
	 *               ^ binary search ends here
	 */
	if ((error = xlog_find_verify_cycle(log, start_blk,
					    (int)num_scan_bblks, 0, &new_blk)))
		goto bp_err;
	if (new_blk != -1)
		last_blk = new_blk;

	/*
	 * Potentially back up over a partial log record write.  We don't need
	 * to search the end of the log because we know it is zero.
	 */
	if ((error = xlog_find_verify_log_record(log, start_blk,
						 &last_blk, 0)) == -1) {
		error = XFS_ERROR(EIO);
		goto bp_err;
	} else if (error)
		goto bp_err;

	*blk_no = last_blk;
bp_err:
	xlog_put_bp(bp);
	if (error)
		return error;
	return -1;
}	/* xlog_find_zeroed */

/* stuff for transactional view */
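/*
 * As the loop below implies, the first four bytes of every data block
 * in a log record are overwritten with the record's cycle number when
 * the record is written, so that torn writes can be detected; the
 * original words are stashed in the header's h_cycle_data array.
 * xlog_unpack_data() restores them before the record is processed.
 */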
STATIC void
xlog_unpack_data(xlog_rec_header_t	*rhead,
		 xfs_caddr_t		dp,
		 xlog_t			*log)
{
	int	i;
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
	uint	*up = (uint *)dp;
	uint	chksum = 0;
#endif

	for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) {
		/* these are both on-disk, so don't endian flip twice */
		*(uint *)dp = *(uint *)&rhead->h_cycle_data[i];
		dp += BBSIZE;
	}
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
	/* divide length by 4 to get # words */
	for (i = 0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) {
		chksum ^= INT_GET(*up, ARCH_CONVERT);
		up++;
	}
	if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) {
		if (!INT_ISZERO(rhead->h_chksum, ARCH_CONVERT) ||
		    ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
			cmn_err(CE_DEBUG,
			    "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)",
			    INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum);
			cmn_err(CE_DEBUG,
"XFS: Disregard message if filesystem was created with non-DEBUG kernel");
			log->l_flags |= XLOG_CHKSUM_MISMATCH;
		}
	}
#endif /* DEBUG && XFS_LOUD_RECOVERY */
}	/* xlog_unpack_data */

STATIC xlog_recover_t *
xlog_recover_find_tid(xlog_recover_t	*q,
		      xlog_tid_t	tid)
{
	xlog_recover_t	*p = q;

	while (p != NULL) {
		if (p->r_log_tid == tid)
			break;
		p = p->r_next;
	}
	return p;
}	/* xlog_recover_find_tid */

STATIC void
xlog_recover_put_hashq(xlog_recover_t	**q,
		       xlog_recover_t	*trans)
{
	trans->r_next = *q;
	*q = trans;
}	/* xlog_recover_put_hashq */

STATIC void
xlog_recover_new_tid(xlog_recover_t	**q,
		     xlog_tid_t		tid,
		     xfs_lsn_t		lsn)
{
	xlog_recover_t	*trans;

	trans = kmem_zalloc(sizeof(xlog_recover_t), 0);
	trans->r_log_tid = tid;
	trans->r_lsn = lsn;
	xlog_recover_put_hashq(q, trans);
}	/* xlog_recover_new_tid */

STATIC int
xlog_recover_unlink_tid(xlog_recover_t	**q,
			xlog_recover_t	*trans)
{
	xlog_recover_t	*tp;
	int		found = 0;

	ASSERT(trans != 0);
	if (trans == *q) {
		*q = (*q)->r_next;
	} else {
		tp = *q;
		while (tp != 0) {
			if (tp->r_next == trans) {
				found = 1;
				break;
			}
			tp = tp->r_next;
		}
		if (!found) {
			xlog_warn(
			     "XFS: xlog_recover_unlink_tid: trans not found");
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
		tp->r_next = tp->r_next->r_next;
	}
	return 0;
}	/* xlog_recover_unlink_tid */

/*
 * Free up any resources allocated by the transaction
 *
 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
 */
STATIC void
xlog_recover_free_trans(xlog_recover_t	*trans)
{
	xlog_recover_item_t	*first_item, *item, *free_item;
	int			i;

	item = first_item = trans->r_itemq;
	do {
		free_item = item;
		item = item->ri_next;
		/* Free the regions in the item. */
		for (i = 0; i < free_item->ri_cnt; i++) {
			kmem_free(free_item->ri_buf[i].i_addr,
				  free_item->ri_buf[i].i_len);
		}
		/* Free the item itself */
		kmem_free(free_item->ri_buf,
			  (free_item->ri_total * sizeof(xfs_log_iovec_t)));
		kmem_free(free_item, sizeof(xlog_recover_item_t));
	} while (first_item != item);
	/* Free the transaction recover structure */
	kmem_free(trans, sizeof(xlog_recover_t));
}	/* xlog_recover_free_trans */

STATIC int
xlog_recover_commit_trans(xlog_t	*log,
			  xlog_recover_t **q,
			  xlog_recover_t *trans,
			  int		pass)
{
	int	error;

	if ((error = xlog_recover_unlink_tid(q, trans)))
		return error;
	if ((error = xlog_recover_do_trans(log, trans, pass)))
		return error;
	xlog_recover_free_trans(trans);		/* no error */
	return 0;
}	/* xlog_recover_commit_trans */

STATIC void
xlog_recover_insert_item_backq(xlog_recover_item_t	**q,
			       xlog_recover_item_t	*item)
{
	if (*q == 0) {
		item->ri_prev = item->ri_next = item;
		*q = item;
	} else {
		item->ri_next = *q;
		item->ri_prev = (*q)->ri_prev;
		(*q)->ri_prev = item;
		item->ri_prev->ri_next = item;
	}
}	/* xlog_recover_insert_item_backq */

STATIC void
xlog_recover_add_item(xlog_recover_item_t	**itemq)
{
	xlog_recover_item_t	*item;

	item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
	xlog_recover_insert_item_backq(itemq, item);
}	/* xlog_recover_add_item */

/*
 * The next region to add is the start of a new region.  It could be
 * a whole region or it could be the first part of a new region.  Because
 * of this, the assumption here is that the type and size fields of all
 * format structures fit into the first 32 bits of the structure.
 *
 * This works because all regions must be 32 bit aligned.  Therefore, we
 * either have both fields or we have neither field.  In the case where we
 * have neither field, the data part of the region is zero length.  We only
 * have a log_op_header and can throw away the header since a new one will
 * appear later.  If we have at least 4 bytes, then we can determine how
 * many regions will appear in the current log item.
 */
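/*
 * In other words (sketch, using the inode log format as the stand-in
 * type the code below casts to): the first 4 bytes of a new item's
 * first region double as in_f->ilf_size, which tells us how many
 * regions (ri_total) worth of ri_buf slots to reserve before the
 * remaining regions stream in.
 */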
STATIC int
xlog_recover_add_to_trans(xlog_recover_t	*trans,
			  xfs_caddr_t		dp,
			  int			len)
{
	xfs_inode_log_format_t	*in_f;			/* any will do */
	xlog_recover_item_t	*item;
	xfs_caddr_t		ptr;

	if (!len)
		return 0;
	ptr = kmem_zalloc(len, 0);
	bcopy(dp, ptr, len);			/* bcopy(src, dst, len) */

	in_f = (xfs_inode_log_format_t *)ptr;
	item = trans->r_itemq;
	if (item == 0) {
		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
		if (len == sizeof(xfs_trans_header_t))
			xlog_recover_add_item(&trans->r_itemq);
		bcopy(dp, &trans->r_theader, len);	/* bcopy(src, dst, len) */
		return 0;
	}
	if (item->ri_prev->ri_total != 0 &&
	    item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
		xlog_recover_add_item(&trans->r_itemq);
	}
	item = trans->r_itemq;
	item = item->ri_prev;

	if (item->ri_total == 0) {		/* first region to be added */
		item->ri_total = in_f->ilf_size;
		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
		item->ri_buf = kmem_zalloc((item->ri_total *
					    sizeof(xfs_log_iovec_t)), 0);
	}
	ASSERT(item->ri_total > item->ri_cnt);
	/* Description region is ri_buf[0] */
	item->ri_buf[item->ri_cnt].i_addr = ptr;
	item->ri_buf[item->ri_cnt].i_len = len;
	item->ri_cnt++;
	return 0;
}	/* xlog_recover_add_to_trans */

STATIC int
xlog_recover_add_to_cont_trans(xlog_recover_t	*trans,
			       xfs_caddr_t	dp,
			       int		len)
{
	xlog_recover_item_t	*item;
	xfs_caddr_t		ptr, old_ptr;
	int			old_len;

	item = trans->r_itemq;
	if (item == 0) {
		/* finish copying rest of trans header */
		xlog_recover_add_item(&trans->r_itemq);
		ptr = (xfs_caddr_t)&trans->r_theader +
		      sizeof(xfs_trans_header_t) - len;
		bcopy(dp, ptr, len);		/* bcopy(src, dst, len) */
		return 0;
	}
	item = item->ri_prev;

	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
	old_len = item->ri_buf[item->ri_cnt-1].i_len;

	ptr = kmem_realloc(old_ptr, len + old_len, old_len, 0);
	bcopy(dp, &ptr[old_len], len);		/* bcopy(src, dst, len) */
	item->ri_buf[item->ri_cnt-1].i_len += len;
	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
	return 0;
}	/* xlog_recover_add_to_cont_trans */

STATIC int
xlog_recover_unmount_trans(xlog_recover_t	*trans)
{
	/* Do nothing now */
	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
	return 0;
}	/* xlog_recover_unmount_trans */

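/*
 * Walk every operation header packed into one log record, demultiplexing
 * the ops into per-transaction-ID queues (rhash) and dispatching on the
 * op flags: start a new tid, accumulate regions, continue a split
 * region, commit, or note an unmount record.
 */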
STATIC int
xlog_recover_process_data(xlog_t	*log,
			  xlog_recover_t *rhash[],
			  xlog_rec_header_t *rhead,
			  xfs_caddr_t	dp,
			  int		pass)
{
	xfs_caddr_t	lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT);
	int		num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT);
	xlog_op_header_t *ohead;
	xlog_recover_t	*trans;
	xlog_tid_t	tid;
	int		error;
	unsigned long	hash;
	uint		flags;

	/* check the log format matches our own - else we can't recover */
	if (xlog_header_check_recover(log->l_mp, rhead))
		return XFS_ERROR(EIO);

	while (dp < lp) {
		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
		ohead = (xlog_op_header_t *)dp;
		dp += sizeof(xlog_op_header_t);
		if (ohead->oh_clientid != XFS_TRANSACTION &&
		    ohead->oh_clientid != XFS_LOG) {
			xlog_warn("XFS: xlog_recover_process_data: bad clientid");
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
		tid = INT_GET(ohead->oh_tid, ARCH_CONVERT);
		hash = XLOG_RHASH(tid);
		trans = xlog_recover_find_tid(rhash[hash], tid);
		if (trans == NULL) {		/* not found; add new tid */
			if (ohead->oh_flags & XLOG_START_TRANS)
				xlog_recover_new_tid(&rhash[hash], tid,
					INT_GET(rhead->h_lsn, ARCH_CONVERT));
		} else {
			ASSERT(dp + INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp);
			flags = ohead->oh_flags & ~XLOG_END_TRANS;
			if (flags & XLOG_WAS_CONT_TRANS)
				flags &= ~XLOG_CONTINUE_TRANS;
			switch (flags) {
			case XLOG_COMMIT_TRANS: {
				error = xlog_recover_commit_trans(log,
						&rhash[hash], trans, pass);
				break;
			}
			case XLOG_UNMOUNT_TRANS: {
				error = xlog_recover_unmount_trans(trans);
				break;
			}
			case XLOG_WAS_CONT_TRANS: {
				error = xlog_recover_add_to_cont_trans(trans, dp,
						INT_GET(ohead->oh_len, ARCH_CONVERT));
				break;
			}
			case XLOG_START_TRANS: {
				xlog_warn("XFS: xlog_recover_process_data: bad transaction");
				ASSERT(0);
				error = XFS_ERROR(EIO);
				break;
			}
			case 0:
			case XLOG_CONTINUE_TRANS: {
				error = xlog_recover_add_to_trans(trans, dp,
						INT_GET(ohead->oh_len, ARCH_CONVERT));
				break;
			}
			default: {
				xlog_warn("XFS: xlog_recover_process_data: bad flag");
				ASSERT(0);
				error = XFS_ERROR(EIO);
				break;
			}
			}	/* switch */
			if (error)
				return error;
		}	/* if */
		dp += INT_GET(ohead->oh_len, ARCH_CONVERT);
		num_logops--;
	}
	return 0;
}	/* xlog_recover_process_data */

/*
 * Read the log from tail to head and process the log records found.
 * Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately.  The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 */
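/*
 * Block layout for the two cases (schematic):
 *
 *	no wrap:  | ... tail_blk [hdr|data...][hdr|data...] head_blk ... |
 *	wrapped:  | ...data] head_blk ......... tail_blk [hdr|data... |
 *
 * In the wrapped case a single record's data may itself straddle the
 * physical end of the log, which is what the split_bblks handling
 * below is for.
 */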
int
xlog_do_recovery_pass(xlog_t	*log,
		      xfs_daddr_t	head_blk,
		      xfs_daddr_t	tail_blk,
		      int		pass)
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no;
	xfs_caddr_t		bufaddr;
	xfs_buf_t		*hbp, *dbp;
	int			error;
	int			bblks, split_bblks;
	xlog_recover_t		*rhash[XLOG_RHASH_SIZE];

	error = 0;
	hbp = xlog_get_bp(1, log->l_mp);
	if (!hbp)
		return ENOMEM;
	dbp = xlog_get_bp(BTOBB(XLOG_MAX_RECORD_BSIZE), log->l_mp);
	if (!dbp) {
		xlog_put_bp(hbp);
		return ENOMEM;
	}
	bzero(rhash, sizeof(rhash));
	if (tail_blk <= head_blk) {
		for (blk_no = tail_blk; blk_no < head_blk; ) {
			if ((error = xlog_bread(log, blk_no, 1, hbp)))
				goto bread_err;
			rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
			ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) ==
			       XLOG_HEADER_MAGIC_NUM);
			ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) <= INT_MAX);
			/* blocks in data section */
			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
			if (bblks > 0) {
				if ((error = xlog_bread(log, blk_no+1, bblks, dbp)))
					goto bread_err;
				xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
				if ((error = xlog_recover_process_data(log, rhash,
							rhead, XFS_BUF_PTR(dbp),
							pass)))
					goto bread_err;
			}
			blk_no += (bblks+1);
		}
	} else {
		/*
		 * Perform recovery around the end of the physical log.  When
		 * the head is not on the same cycle number as the tail, we
		 * can't do a sequential recovery as above.
		 */
		blk_no = tail_blk;
		while (blk_no < log->l_logBBsize) {

			/* Read header of one block */
			if ((error = xlog_bread(log, blk_no, 1, hbp)))
				goto bread_err;
			rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
			ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) ==
			       XLOG_HEADER_MAGIC_NUM);
			ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) <= INT_MAX);
			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));

			/* LR body must have data or it wouldn't have been written */
			ASSERT(bblks > 0);
			blk_no++;		/* successfully read header */
			ASSERT(blk_no <= log->l_logBBsize);

			if ((INT_GET(rhead->h_magicno, ARCH_CONVERT) !=
			     XLOG_HEADER_MAGIC_NUM) ||
			    (BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) > INT_MAX) ||
			    (bblks <= 0) ||
			    (blk_no > log->l_logBBsize)) {
				error = EFSCORRUPTED;
				goto bread_err;
			}

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				if ((error = xlog_bread(log, blk_no, bblks, dbp)))
					goto bread_err;
			} else {
				/* This log record is split across the physical
				 * end of the log */
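				/*
				 * E.g. (hypothetical numbers): with a
				 * 1000-block log, a record whose data starts
				 * at block 995 with bblks == 8 reads blocks
				 * 995..999 (split_bblks == 5) into the buffer
				 * first, then wraps and reads blocks 0..2 in
				 * behind them.
				 */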
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {

					/* some data is before physical end of log */
					ASSERT(blk_no <= INT_MAX);
					split_bblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					if ((error = xlog_bread(log, blk_no,
								split_bblks, dbp)))
						goto bread_err;
				}
				bufaddr = XFS_BUF_PTR(dbp);
				XFS_BUF_SET_PTR(dbp, bufaddr + BBTOB(split_bblks),
						BBTOB(bblks - split_bblks));
				if ((error = xlog_bread(log, 0,
							bblks - split_bblks, dbp)))
					goto bread_err;
				XFS_BUF_SET_PTR(dbp, bufaddr, XLOG_MAX_RECORD_BSIZE);
			}
			xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
			if ((error = xlog_recover_process_data(log, rhash,
						rhead, XFS_BUF_PTR(dbp),
						pass)))
				goto bread_err;
			blk_no += bblks;
		}

		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;

		/* read first part of physical log */
		while (blk_no < head_blk) {
			if ((error = xlog_bread(log, blk_no, 1, hbp)))
				goto bread_err;
			rhead = (xlog_rec_header_t *)XFS_BUF_PTR(hbp);
			ASSERT(INT_GET(rhead->h_magicno, ARCH_CONVERT) ==
			       XLOG_HEADER_MAGIC_NUM);
			ASSERT(BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) <= INT_MAX);
			bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT));
			ASSERT(bblks > 0);
			if ((error = xlog_bread(log, blk_no+1, bblks, dbp)))
				goto bread_err;
			xlog_unpack_data(rhead, XFS_BUF_PTR(dbp), log);
			if ((error = xlog_recover_process_data(log, rhash,
						rhead, XFS_BUF_PTR(dbp),
						pass)))
				goto bread_err;
			blk_no += (bblks+1);
		}
	}

bread_err:
	xlog_put_bp(dbp);
	xlog_put_bp(hbp);

	return error;
}