// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "libxfs.h"
#include "libxlog.h"

#define xfs_readonly_buftarg(buftarg) (0)

/* avoid set-but-unused var warning. gcc is not very bright. */
#define xlog_clear_stale_blocks(log, taillsn) ({ \
        (taillsn) = (taillsn); \
        (0); \
})

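/* Midpoint of two block numbers; used by the binary search code below. */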
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)

/*
 * Verify that the given count of basic blocks is a valid number of blocks
 * to specify for an operation involving the given XFS log buffer.
 * Returns nonzero if the count is valid, 0 otherwise.
 */

static inline int
xlog_buf_bbcount_valid(
        struct xlog     *log,
        int             bbcount)
{
        return bbcount > 0 && bbcount <= log->l_logBBsize;
}

/*
 * Allocate a buffer to hold log data. The buffer needs to be able
 * to map to a range of nbblks basic blocks at any valid (basic
 * block) offset within the log.
 */
xfs_buf_t *
xlog_get_bp(
        struct xlog     *log,
        int             nbblks)
{
        if (!xlog_buf_bbcount_valid(log, nbblks)) {
                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
                        nbblks);
                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
                return NULL;
        }

        /*
         * We do log I/O in units of log sectors (a power-of-2
         * multiple of the basic block size), so we round up the
         * requested size to accommodate the basic blocks required
         * for complete log sectors.
         *
         * In addition, the buffer may be used for a non-sector-
         * aligned block offset, in which case an I/O of the
         * requested size could extend beyond the end of the
         * buffer. If the requested size is only 1 basic block it
         * will never straddle a sector boundary, so this won't be
         * an issue. Nor will this be a problem if the log I/O is
         * done in basic blocks (sector size 1). But otherwise we
         * extend the buffer by one extra log sector to ensure
         * there's space to accommodate this possibility.
         */
        if (nbblks > 1 && log->l_sectBBsize > 1)
                nbblks += log->l_sectBBsize;
        nbblks = round_up(nbblks, log->l_sectBBsize);

        return libxfs_getbufr(log->l_dev, (xfs_daddr_t)-1, nbblks);
}

void
xlog_put_bp(
        xfs_buf_t       *bp)
{
        libxfs_putbufr(bp);
}

/*
 * Return the address of the start of the given block number's data
 * in a log buffer. The buffer covers a log sector-aligned region.
 */
STATIC char *
xlog_align(
        struct xlog     *log,
        xfs_daddr_t     blk_no,
        int             nbblks,
        struct xfs_buf  *bp)
{
        xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);

        ASSERT(offset + nbblks <= bp->b_length);
        return bp->b_addr + BBTOB(offset);
}

/*
 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
 */
int
xlog_bread_noalign(
        struct xlog     *log,
        xfs_daddr_t     blk_no,
        int             nbblks,
        struct xfs_buf  *bp)
{
        if (!xlog_buf_bbcount_valid(log, nbblks)) {
                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
                        nbblks);
                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
                return EFSCORRUPTED;
        }

        blk_no = round_down(blk_no, log->l_sectBBsize);
        nbblks = round_up(nbblks, log->l_sectBBsize);

        ASSERT(nbblks > 0);
        ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));

        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
        bp->b_bcount = BBTOB(nbblks);
        bp->b_error = 0;

        return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
}

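/*
 * Read nbblks basic blocks starting at blk_no and return, in *offset, a
 * pointer to the requested block's data within the (sector-aligned) buffer.
 */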
int
xlog_bread(
        struct xlog     *log,
        xfs_daddr_t     blk_no,
        int             nbblks,
        struct xfs_buf  *bp,
        char            **offset)
{
        int             error;

        error = xlog_bread_noalign(log, blk_no, nbblks, bp);
        if (error)
                return error;

        *offset = xlog_align(log, blk_no, nbblks, bp);
        return 0;
}

/*
 * Read at an offset into the buffer. Returns with the buffer in its original
 * state regardless of the result of the read.
 */
STATIC int
xlog_bread_offset(
        struct xlog     *log,
        xfs_daddr_t     blk_no,         /* block to read from */
        int             nbblks,         /* blocks to read */
        struct xfs_buf  *bp,
        char            *offset)
{
        char            *orig_offset = bp->b_addr;
        int             orig_len = bp->b_bcount;
        int             error, error2;

        error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
        if (error)
                return error;

        error = xlog_bread_noalign(log, blk_no, nbblks, bp);

        /* must reset buffer pointer even on error */
        error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
        if (error)
                return error;
        return error2;
}

/*
 * This routine finds (to an approximation) the first block in the physical
 * log which contains the given cycle. It uses a binary search algorithm.
 * Note that the algorithm cannot be perfect because the disk will not
 * necessarily be perfect.
 */
int
xlog_find_cycle_start(
        struct xlog     *log,
        struct xfs_buf  *bp,
        xfs_daddr_t     first_blk,
        xfs_daddr_t     *last_blk,
        uint            cycle)
{
        char            *offset;
        xfs_daddr_t     mid_blk;
        xfs_daddr_t     end_blk;
        uint            mid_cycle;
        int             error;

        end_blk = *last_blk;
        mid_blk = BLK_AVG(first_blk, end_blk);
        while (mid_blk != first_blk && mid_blk != end_blk) {
                error = xlog_bread(log, mid_blk, 1, bp, &offset);
                if (error)
                        return error;
                mid_cycle = xlog_get_cycle(offset);
                if (mid_cycle == cycle)
                        end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
                else
                        first_blk = mid_blk; /* first_half_cycle == mid_cycle */
                mid_blk = BLK_AVG(first_blk, end_blk);
        }
        ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
               (mid_blk == end_blk && mid_blk-1 == first_blk));

        *last_blk = end_blk;

        return 0;
}

/*
 * Check that a range of blocks does not contain stop_on_cycle_no.
 * Fill in *new_blk with the block offset where such a block is
 * found, or with -1 (an invalid block number) if there is no such
 * block in the range. The scan needs to occur from front to back
 * and the pointer into the region must be updated since a later
 * routine will need to perform another test.
 */
STATIC int
xlog_find_verify_cycle(
        struct xlog     *log,
        xfs_daddr_t     start_blk,
        int             nbblks,
        uint            stop_on_cycle_no,
        xfs_daddr_t     *new_blk)
{
        xfs_daddr_t     i, j;
        uint            cycle;
        xfs_buf_t       *bp;
        int             bufblks;
        char            *buf = NULL;
        int             error = 0;

        /*
         * Greedily allocate a buffer big enough to handle the full
         * range of basic blocks we'll be examining. If that fails,
         * try a smaller size. We need to be able to read at least
         * a log sector, or we're out of luck.
         */
        bufblks = 1 << ffs(nbblks);
        while (bufblks > log->l_logBBsize)
                bufblks >>= 1;
        while (!(bp = xlog_get_bp(log, bufblks))) {
                bufblks >>= 1;
                if (bufblks < log->l_sectBBsize)
                        return ENOMEM;
        }

        for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
                int     bcount;

                bcount = min(bufblks, (start_blk + nbblks - i));

                error = xlog_bread(log, i, bcount, bp, &buf);
                if (error)
                        goto out;

                for (j = 0; j < bcount; j++) {
                        cycle = xlog_get_cycle(buf);
                        if (cycle == stop_on_cycle_no) {
                                *new_blk = i+j;
                                goto out;
                        }

                        buf += BBSIZE;
                }
        }

        *new_blk = -1;

out:
        xlog_put_bp(bp);
        return error;
}

/*
 * Potentially back up over a partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record. Therefore, we subtract one to get the block number
 * of the last block in the given buffer. extra_bblks contains the number
 * of blocks we would have read on a previous read. This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 */
STATIC int
xlog_find_verify_log_record(
        struct xlog     *log,
        xfs_daddr_t     start_blk,
        xfs_daddr_t     *last_blk,
        int             extra_bblks)
{
        xfs_daddr_t     i;
        xfs_buf_t       *bp;
        char            *offset = NULL;
        xlog_rec_header_t *head = NULL;
        int             error = 0;
        int             smallmem = 0;
        int             num_blks = *last_blk - start_blk;
        int             xhdrs;

        ASSERT(start_blk != 0 || *last_blk != start_blk);

        if (!(bp = xlog_get_bp(log, num_blks))) {
                if (!(bp = xlog_get_bp(log, 1)))
                        return ENOMEM;
                smallmem = 1;
        } else {
                error = xlog_bread(log, start_blk, num_blks, bp, &offset);
                if (error)
                        goto out;
                offset += ((num_blks - 1) << BBSHIFT);
        }

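        /*
         * Scan backwards one block at a time until we find a block that
         * begins with the log record header magic number.
         */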
        for (i = (*last_blk) - 1; i >= 0; i--) {
                if (i < start_blk) {
                        /* valid log record not found */
                        xfs_warn(log->l_mp,
                        "Log inconsistent (didn't find previous header)");
                        ASSERT(0);
                        error = XFS_ERROR(EIO);
                        goto out;
                }

                if (smallmem) {
                        error = xlog_bread(log, i, 1, bp, &offset);
                        if (error)
                                goto out;
                }

                head = (xlog_rec_header_t *)offset;

                if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
                        break;

                if (!smallmem)
                        offset -= BBSIZE;
        }

        /*
         * We hit the beginning of the physical log & still no header. Return
         * to caller. If caller can handle a return of -1, then this routine
         * will be called again for the end of the physical log.
         */
        if (i == -1) {
                error = -1;
                goto out;
        }

        /*
         * We have the final block of the good log (the first block
         * of the log record _before_ the head), so we check the uuid.
         */
        if ((error = xlog_header_check_mount(log->l_mp, head)))
                goto out;

        /*
         * We may have found a log record header before we expected one.
         * last_blk will be the 1st block # with a given cycle #. We may end
         * up reading an entire log record. In this case, we don't want to
         * reset last_blk. Only when last_blk points in the middle of a log
         * record do we update last_blk.
         */
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                uint    h_size = be32_to_cpu(head->h_size);

                xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
                if (h_size % XLOG_HEADER_CYCLE_SIZE)
                        xhdrs++;
        } else {
                xhdrs = 1;
        }

        if (*last_blk - i + extra_bblks !=
            BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
                *last_blk = i;

out:
        xlog_put_bp(bp);
        return error;
}

/*
 * Head is defined to be the point of the log where the next log write
 * could go. This means that incomplete LR writes at the end are
 * eliminated when calculating the head. We aren't guaranteed that previous
 * LRs have complete transactions. We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Return: zero if normal, non-zero if error.
 */
STATIC int
xlog_find_head(
        struct xlog     *log,
        xfs_daddr_t     *return_head_blk)
{
        xfs_buf_t       *bp;
        char            *offset;
        xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
        int             num_scan_bblks;
        uint            first_half_cycle, last_half_cycle;
        uint            stop_on_cycle;
        int             error, log_bbnum = log->l_logBBsize;

        /* Is the end of the log device zeroed? */
        if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
                *return_head_blk = first_blk;

                /* Is the whole lot zeroed? */
                if (!first_blk) {
                        /* Linux XFS shouldn't generate totally zeroed logs -
                         * mkfs etc write a dummy unmount record to a fresh
                         * log so we can store the uuid in there
                         */
                        xfs_warn(log->l_mp, "totally zeroed log");
                }

                return 0;
        } else if (error) {
                xfs_warn(log->l_mp, "empty log check failed");
                return error;
        }

        first_blk = 0;                  /* get cycle # of 1st block */
        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;

        error = xlog_bread(log, 0, 1, bp, &offset);
        if (error)
                goto bp_err;

        first_half_cycle = xlog_get_cycle(offset);

        last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
        error = xlog_bread(log, last_blk, 1, bp, &offset);
        if (error)
                goto bp_err;

        last_half_cycle = xlog_get_cycle(offset);
        ASSERT(last_half_cycle != 0);

        /*
         * If the 1st half cycle number is equal to the last half cycle number,
         * then the entire log is stamped with the same cycle number. In this
         * case, head_blk can't be set to zero (which makes sense). The below
         * math doesn't work out properly with head_blk equal to zero. Instead,
         * we set it to log_bbnum which is an invalid block number, but this
         * value makes the math correct. If head_blk doesn't change through
         * all the tests below, *head_blk is set to zero at the very end rather
         * than log_bbnum. In a sense, log_bbnum and zero are the same block
         * in a circular file.
         */
        if (first_half_cycle == last_half_cycle) {
                /*
                 * In this case we believe that the entire log should have
                 * cycle number last_half_cycle. We need to scan backwards
                 * from the end verifying that there are no holes still
                 * containing last_half_cycle - 1. If we find such a hole,
                 * then the start of that hole will be the new head. The
                 * simple case looks like
                 *        x | x ... | x - 1 | x
                 * Another case that fits this picture would be
                 *        x | x + 1 | x ... | x
                 * In this case the head really is somewhere at the end of the
                 * log, as one of the latest writes at the beginning was
                 * incomplete.
                 * One more case is
                 *        x | x + 1 | x ... | x - 1 | x
                 * This is really the combination of the above two cases, and
                 * the head has to end up at the start of the x-1 hole at the
                 * end of the log.
                 *
                 * In the 256k log case, we will read from the beginning to the
                 * end of the log and search for cycle numbers equal to x-1.
                 * We don't worry about the x+1 blocks that we encounter,
                 * because we know that they cannot be the head since the log
                 * started with x.
                 */
                head_blk = log_bbnum;
                stop_on_cycle = last_half_cycle - 1;
        } else {
                /*
                 * In this case we want to find the first block with cycle
                 * number matching last_half_cycle. We expect the log to be
                 * some variation on
                 *        x + 1 ... | x ... | x
                 * The first block with cycle number x (last_half_cycle) will
                 * be where the new head belongs. First we do a binary search
                 * for the first occurrence of last_half_cycle. The binary
                 * search may not be totally accurate, so then we scan back
                 * from there looking for occurrences of last_half_cycle before
                 * us. If that backwards scan wraps around the beginning of
                 * the log, then we look for occurrences of last_half_cycle - 1
                 * at the end of the log. The cases we're looking for look
                 * like
                 *                               v binary search stopped here
                 *        x + 1 ... | x | x + 1 | x ... | x
                 *                   ^ but we want to locate this spot
                 * or
                 *        <---------> less than scan distance
                 *        x + 1 ... | x ... | x - 1 | x
                 *                           ^ we want to locate this spot
                 */
                stop_on_cycle = last_half_cycle;
                if ((error = xlog_find_cycle_start(log, bp, first_blk,
                                                &head_blk, last_half_cycle)))
                        goto bp_err;
        }

        /*
         * Now validate the answer. Scan back some number of maximum possible
         * blocks and make sure each one has the expected cycle number. The
         * maximum is determined by the total possible amount of buffering
         * in the in-core log. The following number can be made tighter if
         * we actually look at the block size of the filesystem.
         */
        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
        if (head_blk >= num_scan_bblks) {
                /*
                 * We are guaranteed that the entire check can be performed
                 * in one buffer.
                 */
                start_blk = head_blk - num_scan_bblks;
                if ((error = xlog_find_verify_cycle(log,
                                                start_blk, num_scan_bblks,
                                                stop_on_cycle, &new_blk)))
                        goto bp_err;
                if (new_blk != -1)
                        head_blk = new_blk;
        } else {                /* need to read 2 parts of log */
                /*
                 * We are going to scan backwards in the log in two parts.
                 * First we scan the physical end of the log. In this part
                 * of the log, we are looking for blocks with cycle number
                 * last_half_cycle - 1.
                 * If we find one, then we know that the log starts there, as
                 * we've found a hole that didn't get written in going around
                 * the end of the physical log. The simple case for this is
                 *        x + 1 ... | x ... | x - 1 | x
                 *        <---------> less than scan distance
                 * If all of the blocks at the end of the log have cycle number
                 * last_half_cycle, then we check the blocks at the start of
                 * the log looking for occurrences of last_half_cycle. If we
                 * find one, then our current estimate for the location of the
                 * first occurrence of last_half_cycle is wrong and we move
                 * back to the hole we've found. This case looks like
                 *        x + 1 ... | x | x + 1 | x ...
                 *                       ^ binary search stopped here
                 * Another case we need to handle that only occurs in 256k
                 * logs is
                 *        x + 1 ... | x ... | x+1 | x ...
                 *                   ^ binary search stops here
                 * In a 256k log, the scan at the end of the log will see the
                 * x + 1 blocks. We need to skip past those since that is
                 * certainly not the head of the log. By searching for
                 * last_half_cycle-1 we accomplish that.
                 */
                ASSERT(head_blk <= INT_MAX &&
                        (xfs_daddr_t) num_scan_bblks >= head_blk);
                start_blk = log_bbnum - (num_scan_bblks - head_blk);
                if ((error = xlog_find_verify_cycle(log, start_blk,
                                        num_scan_bblks - (int)head_blk,
                                        (stop_on_cycle - 1), &new_blk)))
                        goto bp_err;
                if (new_blk != -1) {
                        head_blk = new_blk;
                        goto validate_head;
                }

                /*
                 * Scan beginning of log now. The last part of the physical
                 * log is good. This scan needs to verify that it doesn't find
                 * the last_half_cycle.
                 */
                start_blk = 0;
                ASSERT(head_blk <= INT_MAX);
                if ((error = xlog_find_verify_cycle(log,
                                        start_blk, (int)head_blk,
                                        stop_on_cycle, &new_blk)))
                        goto bp_err;
                if (new_blk != -1)
                        head_blk = new_blk;
        }

validate_head:
        /*
         * Now we need to make sure head_blk is not pointing to a block in
         * the middle of a log record.
         */
        num_scan_bblks = XLOG_REC_SHIFT(log);
        if (head_blk >= num_scan_bblks) {
                start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

                /* start ptr at last block ptr before head_blk */
                if ((error = xlog_find_verify_log_record(log, start_blk,
                                                        &head_blk, 0)) == -1) {
                        error = XFS_ERROR(EIO);
                        goto bp_err;
                } else if (error)
                        goto bp_err;
        } else {
                start_blk = 0;
                ASSERT(head_blk <= INT_MAX);
                if ((error = xlog_find_verify_log_record(log, start_blk,
                                                        &head_blk, 0)) == -1) {
                        /* We hit the beginning of the log during our search */
                        start_blk = log_bbnum - (num_scan_bblks - head_blk);
                        new_blk = log_bbnum;
                        ASSERT(start_blk <= INT_MAX &&
                                (xfs_daddr_t) log_bbnum-start_blk >= 0);
                        ASSERT(head_blk <= INT_MAX);
                        if ((error = xlog_find_verify_log_record(log,
                                                        start_blk, &new_blk,
                                                        (int)head_blk)) == -1) {
                                error = XFS_ERROR(EIO);
                                goto bp_err;
                        } else if (error)
                                goto bp_err;
                        if (new_blk != log_bbnum)
                                head_blk = new_blk;
                } else if (error)
                        goto bp_err;
        }

        xlog_put_bp(bp);
        if (head_blk == log_bbnum)
                *return_head_blk = 0;
        else
                *return_head_blk = head_blk;
        /*
         * When returning here, we have a good block number. Bad block
         * means that during a previous crash, we didn't have a clean break
         * from cycle number N to cycle number N-1. In this case, we need
         * to find the first block with cycle number N-1.
         */
        return 0;

 bp_err:
        xlog_put_bp(bp);

        if (error)
                xfs_warn(log->l_mp, "failed to find log head");
        return error;
}

/*
 * Find the sync block number or the tail of the log.
 *
 * This will be the block number of the last record to have its
 * associated buffers synced to disk. Every log record header has
 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
 * to get a sync block number. The only concern is to figure out which
 * log record header to believe.
 *
 * The following algorithm uses the log record header with the largest
 * lsn. The entire log record does not need to be valid. We only care
 * that the header is valid.
 *
 * We could speed up search by using current head_blk buffer, but it is not
 * available.
 */
int
xlog_find_tail(
        struct xlog     *log,
        xfs_daddr_t     *head_blk,
        xfs_daddr_t     *tail_blk)
{
        xlog_rec_header_t *rhead;
        xlog_op_header_t *op_head;
        char            *offset = NULL;
        xfs_buf_t       *bp;
        int             error, i, found;
        xfs_daddr_t     umount_data_blk;
        xfs_daddr_t     after_umount_blk;
        xfs_lsn_t       tail_lsn;
        int             hblks;

        found = 0;

        /*
         * Find previous log record
         */
        if ((error = xlog_find_head(log, head_blk)))
                return error;

        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;
        if (*head_blk == 0) {           /* special case */
                error = xlog_bread(log, 0, 1, bp, &offset);
                if (error)
                        goto done;

                if (xlog_get_cycle(offset) == 0) {
                        *tail_blk = 0;
                        /* leave all other log inited values alone */
                        goto done;
                }
        }

        /*
         * Search backwards looking for log record header block
         */
        ASSERT(*head_blk < INT_MAX);
        for (i = (int)(*head_blk) - 1; i >= 0; i--) {
                error = xlog_bread(log, i, 1, bp, &offset);
                if (error)
                        goto done;

                if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
                        found = 1;
                        break;
                }
        }
        /*
         * If we haven't found the log record header block, start looking
         * again from the end of the physical log. XXXmiken: There should be
         * a check here to make sure we didn't search more than N blocks in
         * the previous code.
         */
        if (!found) {
                for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
                        error = xlog_bread(log, i, 1, bp, &offset);
                        if (error)
                                goto done;

                        if (*(__be32 *)offset ==
                            cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
                                found = 2;
                                break;
                        }
                }
        }
        if (!found) {
                xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
                xlog_put_bp(bp);
                ASSERT(0);
                return XFS_ERROR(EIO);
        }

        /* find blk_no of tail of log */
        rhead = (xlog_rec_header_t *)offset;
        *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));

        /*
         * Reset log values according to the state of the log when we
         * crashed. In the case where head_blk == 0, we bump curr_cycle
         * by one because the next write starts a new cycle rather than
         * continuing the cycle of the last good log record. At this
         * point we have guaranteed that all partial log records have been
         * accounted for. Therefore, we know that the last good log record
         * written was complete and ended exactly on the end boundary
         * of the physical log.
         */
        log->l_prev_block = i;
        log->l_curr_block = (int)*head_blk;
        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
        if (found == 2)
                log->l_curr_cycle++;
        atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
        atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
        xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
                                        BBTOB(log->l_curr_block));
        xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
                                        BBTOB(log->l_curr_block));

        /*
         * Look for unmount record. If we find it, then we know there
         * was a clean unmount. Since 'i' could be the last block in
         * the physical log, we convert to a log block before comparing
         * to the head_blk.
         *
         * Save the current tail lsn to use to pass to
         * xlog_clear_stale_blocks() below. We won't want to clear the
         * unmount record if there is one, so we pass the lsn of the
         * unmount record rather than the block after it.
         */
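        /*
         * Work out how many basic blocks the record header occupies: v2
         * logs with an iclog size above 32k (XLOG_HEADER_CYCLE_SIZE) spill
         * extra cycle data into additional header blocks.
         */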
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                int     h_size = be32_to_cpu(rhead->h_size);
                int     h_version = be32_to_cpu(rhead->h_version);

                if ((h_version & XLOG_VERSION_2) &&
                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
                                hblks++;
                } else {
                        hblks = 1;
                }
        } else {
                hblks = 1;
        }
        after_umount_blk = (i + hblks + (int)
                BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
        tail_lsn = atomic64_read(&log->l_tail_lsn);
        if (*head_blk == after_umount_blk &&
            be32_to_cpu(rhead->h_num_logops) == 1) {
                umount_data_blk = (i + hblks) % log->l_logBBsize;
                error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
                if (error)
                        goto done;

                op_head = (xlog_op_header_t *)offset;
                if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
                        /*
                         * Set tail and last sync so that newly written
                         * log records will point recovery to after the
                         * current unmount record.
                         */
                        xlog_assign_atomic_lsn(&log->l_tail_lsn,
                                        log->l_curr_cycle, after_umount_blk);
                        xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
                                        log->l_curr_cycle, after_umount_blk);
                        *tail_blk = after_umount_blk;

                        /*
                         * Note that the unmount was clean. If the unmount
                         * was not clean, we need to know this to rebuild the
                         * superblock counters from the perag headers if we
                         * have a filesystem using non-persistent counters.
                         */
                        log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
                }
        }

        /*
         * Make sure that there are no blocks in front of the head
         * with the same cycle number as the head. This can happen
         * because we allow multiple outstanding log writes concurrently,
         * and the later writes might make it out before earlier ones.
         *
         * We use the lsn from before modifying it so that we'll never
         * overwrite the unmount record after a clean unmount.
         *
         * Do this only if we are going to recover the filesystem
         *
         * NOTE: This used to say "if (!readonly)"
         * However on Linux, we can & do recover a read-only filesystem.
         * We only skip recovery if NORECOVERY is specified on mount,
         * in which case we would not be here.
         *
         * But... if the -device- itself is readonly, just skip this.
         * We can't recover this device anyway, so it won't matter.
         */
        if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
                error = xlog_clear_stale_blocks(log, tail_lsn);

done:
        xlog_put_bp(bp);

        if (error)
                xfs_warn(log->l_mp, "failed to locate log tail");
        return error;
}

/*
 * Is the log zeroed at all?
 *
 * The last binary search should be changed to perform an X block read
 * once X becomes small enough. You can then search linearly through
 * the X blocks. This will cut down on the number of reads we need to do.
 *
 * If the log is partially zeroed, this routine will pass back the blkno
 * of the first block with cycle number 0. It won't have a complete LR
 * preceding it.
 *
 * Return:
 *      0  => the log is completely written to
 *      -1 => use *blk_no as the first block of the log
 *      >0 => error has occurred
 */
int
xlog_find_zeroed(
        struct xlog     *log,
        xfs_daddr_t     *blk_no)
{
        xfs_buf_t       *bp;
        char            *offset;
        uint            first_cycle, last_cycle;
        xfs_daddr_t     new_blk, last_blk, start_blk;
        xfs_daddr_t     num_scan_bblks;
        int             error, log_bbnum = log->l_logBBsize;

        *blk_no = 0;

        /* check totally zeroed log */
        bp = xlog_get_bp(log, 1);
        if (!bp)
                return ENOMEM;
        error = xlog_bread(log, 0, 1, bp, &offset);
        if (error)
                goto bp_err;

        first_cycle = xlog_get_cycle(offset);
        if (first_cycle == 0) {         /* completely zeroed log */
                *blk_no = 0;
                xlog_put_bp(bp);
                return -1;
        }

        /* check partially zeroed log */
        error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
        if (error)
                goto bp_err;

        last_cycle = xlog_get_cycle(offset);
        if (last_cycle != 0) {          /* log completely written to */
                xlog_put_bp(bp);
                return 0;
        } else if (first_cycle != 1) {
                /*
                 * If the cycle of the last block is zero, the cycle of
                 * the first block must be 1. If it's not, maybe we're
                 * not looking at a log... Bail out.
                 */
                xfs_warn(log->l_mp,
                        "Log inconsistent or not a log (last==0, first!=1)");
                error = XFS_ERROR(EINVAL);
                goto bp_err;
        }

        /* we have a partially zeroed log */
        last_blk = log_bbnum-1;
        if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
                goto bp_err;

        /*
         * Validate the answer. Because there is no way to guarantee that
         * the entire log is made up of log records which are the same size,
         * we scan over the defined maximum blocks. At this point, the maximum
         * is not chosen to mean anything special. XXXmiken
         */
        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
        ASSERT(num_scan_bblks <= INT_MAX);

        if (last_blk < num_scan_bblks)
                num_scan_bblks = last_blk;
        start_blk = last_blk - num_scan_bblks;

        /*
         * We search for any instances of cycle number 0 that occur before
         * our current estimate of the head. What we're trying to detect is
         *        1 ... | 0 | 1 | 0...
         *                       ^ binary search ends here
         */
        if ((error = xlog_find_verify_cycle(log, start_blk,
                                        (int)num_scan_bblks, 0, &new_blk)))
                goto bp_err;
        if (new_blk != -1)
                last_blk = new_blk;

        /*
         * Potentially back up over a partial log record write. We don't need
         * to search the end of the log because we know it is zero.
         */
        if ((error = xlog_find_verify_log_record(log, start_blk,
                                &last_blk, 0)) == -1) {
                error = XFS_ERROR(EIO);
                goto bp_err;
        } else if (error)
                goto bp_err;

        *blk_no = last_blk;
bp_err:
        xlog_put_bp(bp);
        if (error)
                return error;
        return -1;
}

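/*
 * Look up an in-progress transaction in the recovery hash table by its tid.
 */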
STATIC xlog_recover_t *
xlog_recover_find_tid(
        struct hlist_head       *head,
        xlog_tid_t              tid)
{
        xlog_recover_t          *trans;
        struct hlist_node       *n;

        hlist_for_each_entry(trans, n, head, r_list) {
                if (trans->r_log_tid == tid)
                        return trans;
        }
        return NULL;
}

STATIC void
xlog_recover_new_tid(
        struct hlist_head       *head,
        xlog_tid_t              tid,
        xfs_lsn_t               lsn)
{
        xlog_recover_t          *trans;

        trans = kmem_zalloc(sizeof(xlog_recover_t), 0);
        trans->r_log_tid = tid;
        trans->r_lsn = lsn;
        INIT_LIST_HEAD(&trans->r_itemq);

        INIT_HLIST_NODE(&trans->r_list);
        hlist_add_head(&trans->r_list, head);
}

STATIC void
xlog_recover_add_item(
        struct list_head        *head)
{
        xlog_recover_item_t     *item;

        item = kmem_zalloc(sizeof(xlog_recover_item_t), 0);
        INIT_LIST_HEAD(&item->ri_list);
        list_add_tail(&item->ri_list, head);
}

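/*
 * Append a continuation region to the last item added to the given
 * transaction; the item's buffer is reallocated to make room. If no items
 * have been added yet, the data completes the in-core transaction header.
 */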
STATIC int
xlog_recover_add_to_cont_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
        char                    *dp,
        int                     len)
{
        xlog_recover_item_t     *item;
        char                    *ptr, *old_ptr;
        int                     old_len;

        if (list_empty(&trans->r_itemq)) {
                /* finish copying rest of trans header */
                xlog_recover_add_item(&trans->r_itemq);
                ptr = (char *) &trans->r_theader +
                                sizeof(xfs_trans_header_t) - len;
                memcpy(ptr, dp, len); /* d, s, l */
                return 0;
        }
        /* take the tail entry */
        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);

        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
        old_len = item->ri_buf[item->ri_cnt-1].i_len;

        ptr = kmem_realloc(old_ptr, len+old_len, 0);
        memcpy(&ptr[old_len], dp, len); /* d, s, l */
        item->ri_buf[item->ri_cnt-1].i_len += len;
        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
        trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
        return 0;
}

/*
 * The next region to add is the start of a new region. It could be
 * a whole region or it could be the first part of a new region. Because
 * of this, the assumption here is that the type and size fields of all
 * format structures fit into the first 32 bits of the structure.
 *
 * This works because all regions must be 32 bit aligned. Therefore, we
 * either have both fields or we have neither field. In the case we have
 * neither field, the data part of the region is zero length. We only have
 * a log_op_header and can throw away the header since a new one will appear
 * later. If we have at least 4 bytes, then we can determine how many regions
 * will appear in the current log item.
 */
STATIC int
xlog_recover_add_to_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
        char                    *dp,
        int                     len)
{
        struct xfs_inode_log_format *in_f; /* any will do */
        xlog_recover_item_t     *item;
        char                    *ptr;

        if (!len)
                return 0;
        if (list_empty(&trans->r_itemq)) {
                /* we need to catch log corruptions here */
                if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
                        xfs_warn(log->l_mp, "%s: bad header magic number",
                                __func__);
                        ASSERT(0);
                        return XFS_ERROR(EIO);
                }
                if (len == sizeof(xfs_trans_header_t))
                        xlog_recover_add_item(&trans->r_itemq);
                memcpy(&trans->r_theader, dp, len); /* d, s, l */
                return 0;
        }

        ptr = kmem_alloc(len, 0);
        memcpy(ptr, dp, len);
        in_f = (struct xfs_inode_log_format *)ptr;

        /* take the tail entry */
        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
        if (item->ri_total != 0 &&
            item->ri_total == item->ri_cnt) {
                /* tail item is in use, get a new one */
                xlog_recover_add_item(&trans->r_itemq);
                item = list_entry(trans->r_itemq.prev,
                                        xlog_recover_item_t, ri_list);
        }

        if (item->ri_total == 0) {              /* first region to be added */
                if (in_f->ilf_size == 0 ||
                    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
                        xfs_warn(log->l_mp,
                        "bad number of regions (%d) in inode log format",
                                in_f->ilf_size);
                        ASSERT(0);
                        kmem_free(ptr);
                        return XFS_ERROR(EIO);
                }

                item->ri_total = in_f->ilf_size;
                item->ri_buf =
                        kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
                                    0);
        }
        ASSERT(item->ri_total > item->ri_cnt);
        /* Description region is ri_buf[0] */
        item->ri_buf[item->ri_cnt].i_addr = ptr;
        item->ri_buf[item->ri_cnt].i_len = len;
        item->ri_cnt++;
        trace_xfs_log_recover_item_add(log, trans, item, 0);
        return 0;
}

/*
 * Free up any resources allocated by the transaction
 *
 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
 */
STATIC void
xlog_recover_free_trans(
        struct xlog_recover     *trans)
{
        xlog_recover_item_t     *item, *n;
        int                     i;

        list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
                /* Free the regions in the item. */
                list_del(&item->ri_list);
                for (i = 0; i < item->ri_cnt; i++)
                        kmem_free(item->ri_buf[i].i_addr);
                /* Free the item itself */
                kmem_free(item->ri_buf);
                kmem_free(item);
        }
        /* Free the transaction recover structure */
        kmem_free(trans);
}

/*
 * Perform the transaction.
 *
 * If the transaction modifies a buffer or inode, do it now. Otherwise,
 * EFIs and EFDs get queued up by adding entries into the AIL for them.
 */
STATIC int
xlog_recover_commit_trans(
        struct xlog             *log,
        struct xlog_recover     *trans,
        int                     pass)
{
        int                     error = 0;

        hlist_del(&trans->r_list);
        if ((error = xlog_recover_do_trans(log, trans, pass)))
                return error;

        xlog_recover_free_trans(trans);
        return 0;
}

STATIC int
xlog_recover_unmount_trans(
        struct xlog             *log,
        xlog_recover_t          *trans)
{
        /* Do nothing now */
        xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
        return 0;
}

/*
 * There are two valid states of the r_state field. 0 indicates that the
 * transaction structure is in a normal state. We have either seen the
 * start of the transaction or the last operation we added was not a partial
 * operation. If the last operation we added to the transaction was a
 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
 *
 * NOTE: skip LRs with 0 data length.
 */
STATIC int
xlog_recover_process_data(
        struct xlog             *log,
        struct hlist_head       rhash[],
        struct xlog_rec_header  *rhead,
        char                    *dp,
        int                     pass)
{
        char                    *lp;
        int                     num_logops;
        xlog_op_header_t        *ohead;
        xlog_recover_t          *trans;
        xlog_tid_t              tid;
        int                     error;
        unsigned long           hash;
        uint                    flags;

        lp = dp + be32_to_cpu(rhead->h_len);
        num_logops = be32_to_cpu(rhead->h_num_logops);

        /* check the log format matches our own - else we can't recover */
        if (xlog_header_check_recover(log->l_mp, rhead))
                return (XFS_ERROR(EIO));

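        /*
         * Walk each log operation header in the record, collating the
         * regions that follow into per-tid transactions via the hash table.
         */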
        while ((dp < lp) && num_logops) {
                ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
                ohead = (xlog_op_header_t *)dp;
                dp += sizeof(xlog_op_header_t);
                if (ohead->oh_clientid != XFS_TRANSACTION &&
                    ohead->oh_clientid != XFS_LOG) {
                        xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
                                __func__, ohead->oh_clientid);
                        ASSERT(0);
                        return (XFS_ERROR(EIO));
                }
                tid = be32_to_cpu(ohead->oh_tid);
                hash = XLOG_RHASH(tid);
                trans = xlog_recover_find_tid(&rhash[hash], tid);
                if (trans == NULL) {            /* not found; add new tid */
                        if (ohead->oh_flags & XLOG_START_TRANS)
                                xlog_recover_new_tid(&rhash[hash], tid,
                                        be64_to_cpu(rhead->h_lsn));
                } else {
                        if (dp + be32_to_cpu(ohead->oh_len) > lp) {
                                xfs_warn(log->l_mp, "%s: bad length 0x%x",
                                        __func__, be32_to_cpu(ohead->oh_len));
                                return (XFS_ERROR(EIO));
                        }
                        flags = ohead->oh_flags & ~XLOG_END_TRANS;
                        if (flags & XLOG_WAS_CONT_TRANS)
                                flags &= ~XLOG_CONTINUE_TRANS;
                        switch (flags) {
                        case XLOG_COMMIT_TRANS:
                                error = xlog_recover_commit_trans(log,
                                                        trans, pass);
                                break;
                        case XLOG_UNMOUNT_TRANS:
                                error = xlog_recover_unmount_trans(log, trans);
                                break;
                        case XLOG_WAS_CONT_TRANS:
                                error = xlog_recover_add_to_cont_trans(log,
                                                trans, dp,
                                                be32_to_cpu(ohead->oh_len));
                                break;
                        case XLOG_START_TRANS:
                                xfs_warn(log->l_mp, "%s: bad transaction",
                                        __func__);
                                ASSERT(0);
                                error = XFS_ERROR(EIO);
                                break;
                        case 0:
                        case XLOG_CONTINUE_TRANS:
                                error = xlog_recover_add_to_trans(log, trans,
                                                dp, be32_to_cpu(ohead->oh_len));
                                break;
                        default:
                                xfs_warn(log->l_mp, "%s: bad flag 0x%x",
                                        __func__, flags);
                                ASSERT(0);
                                error = XFS_ERROR(EIO);
                                break;
                        }
                        if (error)
                                return error;
                }
                dp += be32_to_cpu(ohead->oh_len);
                num_logops--;
        }
        return 0;
}

/*
 * Unpack the log buffer data and crc check it. If the check fails, issue a
 * warning if and only if the CRC in the header is non-zero. This makes the
 * check an advisory warning, and the zero CRC check will prevent failure
 * warnings from being emitted when upgrading the kernel from one that does not
 * add CRCs by default.
 *
 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
 * corruption failure.
 *
 * XXX: we do not calculate the CRC here yet. It's not clear what we should do
 * with CRC errors here in userspace, so we'll address that problem later on.
 */
#define xlog_cksum(l,r,dp,len) ((r)->h_crc)
STATIC int
xlog_unpack_data_crc(
        struct xlog_rec_header  *rhead,
        char                    *dp,
        struct xlog             *log)
{
        __le32                  crc;

        crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
        if (crc != rhead->h_crc) {
                if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
                        xfs_alert(log->l_mp,
                "log record CRC mismatch: found 0x%x, expected 0x%x.",
                                le32_to_cpu(rhead->h_crc),
                                le32_to_cpu(crc));
                        xfs_hex_dump(dp, 32);
                }

                /*
                 * If we've detected a log record corruption, then we can't
                 * recover past this point. Abort recovery if we are enforcing
                 * CRC protection by punting an error back up the stack.
                 */
                if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
                        return EFSCORRUPTED;
        }

        return 0;
}

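/*
 * Restore the cycle numbers stashed in the record header back into the
 * first word of each data block, undoing the packing done when the record
 * was written.
 */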
STATIC int
xlog_unpack_data(
        struct xlog_rec_header  *rhead,
        char                    *dp,
        struct xlog             *log)
{
        int                     i, j, k;
        int                     error;

        error = xlog_unpack_data_crc(rhead, dp, log);
        if (error)
                return error;

        for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
                  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
                *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
                dp += BBSIZE;
        }

        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
                for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
                        *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
                        dp += BBSIZE;
                }
        }

        return 0;
}

STATIC int
xlog_valid_rec_header(
        struct xlog             *log,
        struct xlog_rec_header  *rhead,
        xfs_daddr_t             blkno)
{
        int                     hlen;

        if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
                XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
                                XFS_ERRLEVEL_LOW, log->l_mp);
                return XFS_ERROR(EFSCORRUPTED);
        }
        if (unlikely(
            (!rhead->h_version ||
            (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
                xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
                        __func__, be32_to_cpu(rhead->h_version));
                return XFS_ERROR(EIO);
        }

        /* LR body must have data or it wouldn't have been written */
        hlen = be32_to_cpu(rhead->h_len);
        if (unlikely(hlen <= 0 || hlen > INT_MAX)) {
                XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
                                XFS_ERRLEVEL_LOW, log->l_mp);
                return XFS_ERROR(EFSCORRUPTED);
        }
        if (unlikely(blkno > log->l_logBBsize || blkno > INT_MAX)) {
                XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
                                XFS_ERRLEVEL_LOW, log->l_mp);
                return XFS_ERROR(EFSCORRUPTED);
        }
        return 0;
}

/*
 * Read the log from tail to head and process the log records found.
 * Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately. The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 */
int
xlog_do_recovery_pass(
        struct xlog             *log,
        xfs_daddr_t             head_blk,
        xfs_daddr_t             tail_blk,
        int                     pass)
{
        xlog_rec_header_t       *rhead;
        xfs_daddr_t             blk_no;
        char                    *offset;
        xfs_buf_t               *hbp, *dbp;
        int                     error = 0, h_size;
        int                     bblks, split_bblks;
        int                     hblks, split_hblks, wrapped_hblks;
        struct hlist_head       rhash[XLOG_RHASH_SIZE];

        ASSERT(head_blk != tail_blk);

        /*
         * Read the header of the tail block and get the iclog buffer size from
         * h_size. Use this to tell how many sectors make up the log header.
         */
        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
                /*
                 * When using variable length iclogs, read first sector of
                 * iclog header and extract the header size from it. Get a
                 * new hbp that is the correct size.
                 */
                hbp = xlog_get_bp(log, 1);
                if (!hbp)
                        return ENOMEM;

                error = xlog_bread(log, tail_blk, 1, hbp, &offset);
                if (error)
                        goto bread_err1;

                rhead = (xlog_rec_header_t *)offset;
                error = xlog_valid_rec_header(log, rhead, tail_blk);
                if (error)
                        goto bread_err1;
                h_size = be32_to_cpu(rhead->h_size);
                if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
                                hblks++;
                        xlog_put_bp(hbp);
                        hbp = xlog_get_bp(log, hblks);
                } else {
                        hblks = 1;
                }
        } else {
                ASSERT(log->l_sectBBsize == 1);
                hblks = 1;
                hbp = xlog_get_bp(log, 1);
                h_size = XLOG_BIG_RECORD_BSIZE;
        }

        if (!hbp)
                return ENOMEM;
        dbp = xlog_get_bp(log, BTOBB(h_size));
        if (!dbp) {
                xlog_put_bp(hbp);
                return ENOMEM;
        }

        memset(rhash, 0, sizeof(rhash));
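
        /*
         * Simple case first: the active part of the log does not wrap, so
         * walk it sequentially from tail to head, one record (header plus
         * data blocks) at a time.
         */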
        if (tail_blk <= head_blk) {
                for (blk_no = tail_blk; blk_no < head_blk; ) {
                        error = xlog_bread(log, blk_no, hblks, hbp, &offset);
                        if (error)
                                goto bread_err2;

                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead, blk_no);
                        if (error)
                                goto bread_err2;

                        /* blocks in data section */
                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
                        error = xlog_bread(log, blk_no + hblks, bblks, dbp,
                                           &offset);
                        if (error)
                                goto bread_err2;

                        error = xlog_unpack_data(rhead, offset, log);
                        if (error)
                                goto bread_err2;

                        error = xlog_recover_process_data(log,
                                                rhash, rhead, offset, pass);
                        if (error)
                                goto bread_err2;
                        blk_no += bblks + hblks;
                }
        } else {
                /*
                 * Perform recovery around the end of the physical log.
                 * When the head is not on the same cycle number as the tail,
                 * we can't do a sequential recovery as above.
                 */
                blk_no = tail_blk;
                while (blk_no < log->l_logBBsize) {
                        /*
                         * Check for header wrapping around physical end-of-log
                         */
                        offset = hbp->b_addr;
                        split_hblks = 0;
                        wrapped_hblks = 0;
                        if (blk_no + hblks <= log->l_logBBsize) {
                                /* Read header in one read */
                                error = xlog_bread(log, blk_no, hblks, hbp,
                                                   &offset);
                                if (error)
                                        goto bread_err2;
                        } else {
                                /* This LR is split across physical log end */
                                if (blk_no != log->l_logBBsize) {
                                        /* some data before physical log end */
                                        ASSERT(blk_no <= INT_MAX);
                                        split_hblks =
                                                log->l_logBBsize - (int)blk_no;
                                        ASSERT(split_hblks > 0);
                                        error = xlog_bread(log, blk_no,
                                                           split_hblks, hbp,
                                                           &offset);
                                        if (error)
                                                goto bread_err2;
                                }

                                /*
                                 * Note: this black magic still works with
                                 * large sector sizes (non-512) only because:
                                 * - we increased the buffer size originally
                                 *   by 1 sector giving us enough extra space
                                 *   for the second read;
                                 * - the log start is guaranteed to be sector
                                 *   aligned;
                                 * - we read the log end (LR header start)
                                 *   _first_, then the log start (LR header end)
                                 *   - order is important.
                                 */
                                wrapped_hblks = hblks - split_hblks;
                                error = xlog_bread_offset(log, 0,
                                                wrapped_hblks, hbp,
                                                offset + BBTOB(split_hblks));
                                if (error)
                                        goto bread_err2;
                        }
                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead,
                                                split_hblks ? blk_no : 0);
                        if (error)
                                goto bread_err2;

                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
                        blk_no += hblks;

                        /* Read in data for log record */
                        if (blk_no + bblks <= log->l_logBBsize) {
                                error = xlog_bread(log, blk_no, bblks, dbp,
                                                   &offset);
                                if (error)
                                        goto bread_err2;
                        } else {
                                /* This log record is split across the
                                 * physical end of log */
                                offset = dbp->b_addr;
                                split_bblks = 0;
                                if (blk_no != log->l_logBBsize) {
                                        /* some data is before the physical
                                         * end of log */
                                        ASSERT(!wrapped_hblks);
                                        ASSERT(blk_no <= INT_MAX);
                                        split_bblks =
                                                log->l_logBBsize - (int)blk_no;
                                        ASSERT(split_bblks > 0);
                                        error = xlog_bread(log, blk_no,
                                                        split_bblks, dbp,
                                                        &offset);
                                        if (error)
                                                goto bread_err2;
                                }

                                /*
                                 * Note: this black magic still works with
                                 * large sector sizes (non-512) only because:
                                 * - we increased the buffer size originally
                                 *   by 1 sector giving us enough extra space
                                 *   for the second read;
                                 * - the log start is guaranteed to be sector
                                 *   aligned;
                                 * - we read the log end (LR header start)
                                 *   _first_, then the log start (LR header end)
                                 *   - order is important.
                                 */
                                error = xlog_bread_offset(log, 0,
                                                bblks - split_bblks, dbp,
                                                offset + BBTOB(split_bblks));
                                if (error)
                                        goto bread_err2;
                        }

                        error = xlog_unpack_data(rhead, offset, log);
                        if (error)
                                goto bread_err2;

                        error = xlog_recover_process_data(log, rhash,
                                                        rhead, offset, pass);
                        if (error)
                                goto bread_err2;
                        blk_no += bblks;
                }

                ASSERT(blk_no >= log->l_logBBsize);
                blk_no -= log->l_logBBsize;

                /* read first part of physical log */
                while (blk_no < head_blk) {
                        error = xlog_bread(log, blk_no, hblks, hbp, &offset);
                        if (error)
                                goto bread_err2;

                        rhead = (xlog_rec_header_t *)offset;
                        error = xlog_valid_rec_header(log, rhead, blk_no);
                        if (error)
                                goto bread_err2;

                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
                        error = xlog_bread(log, blk_no+hblks, bblks, dbp,
                                           &offset);
                        if (error)
                                goto bread_err2;

                        error = xlog_unpack_data(rhead, offset, log);
                        if (error)
                                goto bread_err2;

                        error = xlog_recover_process_data(log, rhash,
                                                        rhead, offset, pass);
                        if (error)
                                goto bread_err2;
                        blk_no += bblks + hblks;
                }
        }

 bread_err2:
        xlog_put_bp(dbp);
 bread_err1:
        xlog_put_bp(hbp);
        return error;
}
d321ceac 1664}