// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "libxfs.h"
#include "libxlog.h"

#define xfs_readonly_buftarg(buftarg)	(0)

/* avoid set-but-unused var warning. gcc is not very bright. */
#define xlog_clear_stale_blocks(log, taillsn)	({ \
	(taillsn) = (taillsn); \
	(0); \
})

#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
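/*
 * BLK_AVG() is the integer midpoint used by the binary searches below;
 * e.g. BLK_AVG(3, 8) == 5.  When the two bounds differ by exactly one,
 * the average equals the lower bound, which is what terminates those
 * search loops.
 */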

/*
 * Verify the given count of basic blocks is a valid number of blocks
 * to specify for an operation involving the given XFS log buffer.
 * Returns nonzero if the count is valid, 0 otherwise.
 */

static inline int
xlog_buf_bbcount_valid(
	struct xlog	*log,
	int		bbcount)
{
	return bbcount > 0 && bbcount <= log->l_logBBsize;
}

/*
 * Allocate a buffer to hold log data.  The buffer needs to be able
 * to map to a range of nbblks basic blocks at any valid (basic
 * block) offset within the log.
 */
struct xfs_buf *
xlog_get_bp(
	struct xlog	*log,
	int		nbblks)
{
	struct xfs_buf	*bp;

	if (!xlog_buf_bbcount_valid(log, nbblks)) {
		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
			nbblks);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
		return NULL;
	}

	/*
	 * We do log I/O in units of log sectors (a power-of-2
	 * multiple of the basic block size), so we round up the
	 * requested size to accommodate the basic blocks required
	 * for complete log sectors.
	 *
	 * In addition, the buffer may be used for a non-sector-
	 * aligned block offset, in which case an I/O of the
	 * requested size could extend beyond the end of the
	 * buffer.  If the requested size is only 1 basic block it
	 * will never straddle a sector boundary, so this won't be
	 * an issue.  Nor will this be a problem if the log I/O is
	 * done in basic blocks (sector size 1).  But otherwise we
	 * extend the buffer by one extra log sector to ensure
	 * there's space to accommodate this possibility.
	 */
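	/*
	 * Illustrative numbers: with l_sectBBsize == 8, a request for 13
	 * basic blocks becomes 13 + 8 = 21, rounded up to 24 below, i.e.
	 * three whole log sectors.
	 */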
	if (nbblks > 1 && log->l_sectBBsize > 1)
		nbblks += log->l_sectBBsize;
	nbblks = round_up(nbblks, log->l_sectBBsize);

	libxfs_buf_get_uncached(log->l_dev, nbblks, 0, &bp);
	return bp;
}

/*
 * Return the address of the start of the given block number's data
 * in a log buffer.  The buffer covers a log sector-aligned region.
 */
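/*
 * For example, with l_sectBBsize == 8, blk_no 13 has in-sector offset
 * 13 & 7 == 5, so its data starts BBTOB(5) == 2560 bytes into b_addr.
 */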
STATIC char *
xlog_align(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	struct xfs_buf	*bp)
{
	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);

	ASSERT(offset + nbblks <= bp->b_length);
	return bp->b_addr + BBTOB(offset);
}


/*
 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 */
int
xlog_bread_noalign(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	struct xfs_buf	*bp)
{
	if (!xlog_buf_bbcount_valid(log, nbblks)) {
		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
			nbblks);
		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
		return EFSCORRUPTED;
	}

	blk_no = round_down(blk_no, log->l_sectBBsize);
	nbblks = round_up(nbblks, log->l_sectBBsize);
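	/*
	 * e.g. with l_sectBBsize == 8, a 3-block read at blk_no 13 becomes
	 * an 8-block read at blk_no 8; xlog_align() later points callers
	 * back at the sub-range they actually asked for.
	 */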

	ASSERT(nbblks > 0);
	ASSERT(nbblks <= bp->b_length);

	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
	bp->b_length = nbblks;
	bp->b_error = 0;

	return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
}

int
xlog_bread(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	struct xfs_buf	*bp,
	char		**offset)
{
	int		error;

	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
	if (error)
		return error;

	*offset = xlog_align(log, blk_no, nbblks, bp);
	return 0;
}

/*
 * Read at an offset into the buffer.  Returns with the buffer in its original
 * state regardless of the result of the read.
 */
STATIC int
xlog_bread_offset(
	struct xlog	*log,
	xfs_daddr_t	blk_no,		/* block to read from */
	int		nbblks,		/* blocks to read */
	struct xfs_buf	*bp,
	char		*offset)
{
	char		*orig_offset = bp->b_addr;
	int		orig_len = BBTOB(bp->b_length);
	int		error, error2;

	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
	if (error)
		return error;

	error = xlog_bread_noalign(log, blk_no, nbblks, bp);

	/* must reset buffer pointer even on error */
	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
	if (error)
		return error;
	return error2;
}

/*
 * This routine finds (to an approximation) the first block in the physical
 * log which contains the given cycle.  It uses a binary search algorithm.
 * Note that the algorithm can not be perfect because the disk will not
 * necessarily be perfect.
 */
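/*
 * Callers arrange that the cycle at first_blk is not 'cycle' while the
 * cycle at *last_blk is, so the bisection below converges on that
 * boundary: e.g. over cycles 6 6 6 7 7, a search for cycle 7 returns
 * the block holding the first 7.
 */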
int
xlog_find_cycle_start(
	struct xlog	*log,
	struct xfs_buf	*bp,
	xfs_daddr_t	first_blk,
	xfs_daddr_t	*last_blk,
	uint		cycle)
{
	char		*offset;
	xfs_daddr_t	mid_blk;
	xfs_daddr_t	end_blk;
	uint		mid_cycle;
	int		error;

	end_blk = *last_blk;
	mid_blk = BLK_AVG(first_blk, end_blk);
	while (mid_blk != first_blk && mid_blk != end_blk) {
		error = xlog_bread(log, mid_blk, 1, bp, &offset);
		if (error)
			return error;
		mid_cycle = xlog_get_cycle(offset);
		if (mid_cycle == cycle)
			end_blk = mid_blk;	/* last_half_cycle == mid_cycle */
		else
			first_blk = mid_blk;	/* first_half_cycle == mid_cycle */
		mid_blk = BLK_AVG(first_blk, end_blk);
	}
	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
	       (mid_blk == end_blk && mid_blk-1 == first_blk));

	*last_blk = end_blk;

	return 0;
}

/*
 * Check that a range of blocks does not contain stop_on_cycle_no.
 * Fill in *new_blk with the block offset where such a block is
 * found, or with -1 (an invalid block number) if there is no such
 * block in the range.  The scan needs to occur from front to back
 * and the pointer into the region must be updated since a later
 * routine will need to perform another test.
 */
STATIC int
xlog_find_verify_cycle(
	struct xlog	*log,
	xfs_daddr_t	start_blk,
	int		nbblks,
	uint		stop_on_cycle_no,
	xfs_daddr_t	*new_blk)
{
	xfs_daddr_t	i, j;
	uint		cycle;
	struct xfs_buf	*bp;
	int		bufblks;
	char		*buf = NULL;
	int		error = 0;

	/*
	 * Greedily allocate a buffer big enough to handle the full
	 * range of basic blocks we'll be examining.  If that fails,
	 * try a smaller size.  We need to be able to read at least
	 * a log sector, or we're out of luck.
	 */
	bufblks = 1 << ffs(nbblks);
	while (bufblks > log->l_logBBsize)
		bufblks >>= 1;
	while (!(bp = xlog_get_bp(log, bufblks))) {
		bufblks >>= 1;
		if (bufblks < log->l_sectBBsize)
			return ENOMEM;
	}

	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
		int	bcount;

		bcount = min(bufblks, (start_blk + nbblks - i));

		error = xlog_bread(log, i, bcount, bp, &buf);
		if (error)
			goto out;

		for (j = 0; j < bcount; j++) {
			cycle = xlog_get_cycle(buf);
			if (cycle == stop_on_cycle_no) {
				*new_blk = i+j;
				goto out;
			}

			buf += BBSIZE;
		}
	}

	*new_blk = -1;

out:
	libxfs_buf_relse(bp);
	return error;
}

/*
 * Potentially backup over partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record.  Therefore, we subtract one to get the block number
 * of the last block in the given buffer.  extra_bblks contains the number
 * of blocks we would have read on a previous read.  This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 */
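/*
 * For example, when the last record straddles the physical end of the
 * log, a first call verifies the blocks at the tail end and a second
 * call passes the count already read in extra_bblks, so the
 * record-length check below still adds up.
 */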
STATIC int
xlog_find_verify_log_record(
	struct xlog	*log,
	xfs_daddr_t	start_blk,
	xfs_daddr_t	*last_blk,
	int		extra_bblks)
{
	xfs_daddr_t	i;
	struct xfs_buf	*bp;
	char		*offset = NULL;
	xlog_rec_header_t	*head = NULL;
	int		error = 0;
	int		smallmem = 0;
	int		num_blks = *last_blk - start_blk;
	int		xhdrs;

	ASSERT(start_blk != 0 || *last_blk != start_blk);

	if (!(bp = xlog_get_bp(log, num_blks))) {
		if (!(bp = xlog_get_bp(log, 1)))
			return ENOMEM;
		smallmem = 1;
	} else {
		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
		if (error)
			goto out;
		offset += ((num_blks - 1) << BBSHIFT);
	}

	for (i = (*last_blk) - 1; i >= 0; i--) {
		if (i < start_blk) {
			/* valid log record not found */
			xfs_warn(log->l_mp,
		"Log inconsistent (didn't find previous header)");
			ASSERT(0);
			error = XFS_ERROR(EIO);
			goto out;
		}

		if (smallmem) {
			error = xlog_bread(log, i, 1, bp, &offset);
			if (error)
				goto out;
		}

		head = (xlog_rec_header_t *)offset;

		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
			break;

		if (!smallmem)
			offset -= BBSIZE;
	}

	/*
	 * We hit the beginning of the physical log & still no header.  Return
	 * to caller.  If caller can handle a return of -1, then this routine
	 * will be called again for the end of the physical log.
	 */
	if (i == -1) {
		error = -1;
		goto out;
	}

	/*
	 * We have the final block of the good log (the first block
	 * of the log record _before_ the head).  So we check the uuid.
	 */
	if ((error = xlog_header_check_mount(log->l_mp, head)))
		goto out;

	/*
	 * We may have found a log record header before we expected one.
	 * last_blk will be the 1st block # with a given cycle #.  We may end
	 * up reading an entire log record.  In this case, we don't want to
	 * reset last_blk.  Only when last_blk points in the middle of a log
	 * record do we update last_blk.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		uint	h_size = be32_to_cpu(head->h_size);

		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
		if (h_size % XLOG_HEADER_CYCLE_SIZE)
			xhdrs++;
	} else {
		xhdrs = 1;
	}

	if (*last_blk - i + extra_bblks !=
	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
		*last_blk = i;

out:
	libxfs_buf_relse(bp);
	return error;
}

/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LR have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Return: zero if normal, non-zero if error.
 */
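/*
 * In outline: read the cycle numbers of the first and last blocks,
 * binary search for the first block of the last half cycle, scan back
 * over the maximum possible buffered range to rule out holes, then
 * back up over any partial log record write.
 */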
STATIC int
xlog_find_head(
	struct xlog	*log,
	xfs_daddr_t	*return_head_blk)
{
	struct xfs_buf	*bp;
	char		*offset;
	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
	int		num_scan_bblks;
	uint		first_half_cycle, last_half_cycle;
	uint		stop_on_cycle;
	int		error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
		*return_head_blk = first_blk;

		/* Is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xfs_warn(log->l_mp, "totally zeroed log");
		}

		return 0;
	} else if (error) {
		xfs_warn(log->l_mp, "empty log check failed");
		return error;
	}

	first_blk = 0;			/* get cycle # of 1st block */
	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;

	error = xlog_bread(log, 0, 1, bp, &offset);
	if (error)
		goto bp_err;

	first_half_cycle = xlog_get_cycle(offset);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	error = xlog_bread(log, last_blk, 1, bp, &offset);
	if (error)
		goto bp_err;

	last_half_cycle = xlog_get_cycle(offset);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have
		 * cycle number last_half_cycle.  We need to scan backwards
		 * from the end verifying that there are no holes still
		 * containing last_half_cycle - 1.  If we find such a hole,
		 * then the start of that hole will be the new head.  The
		 * simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was
		 * incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and
		 * the head has to end up at the start of the x-1 hole at the
		 * end of the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.
		 * We don't worry about the x+1 blocks that we encounter,
		 * because we know that they cannot be the head since the log
		 * started with x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle
		 * number matching last_half_cycle.  We expect the log to be
		 * some variation on
		 *        x + 1 ... | x ... | x
		 * The first block with cycle number x (last_half_cycle) will
		 * be where the new head belongs.  First we do a binary search
		 * for the first occurrence of last_half_cycle.  The binary
		 * search may not be totally accurate, so then we scan back
		 * from there looking for occurrences of last_half_cycle before
		 * us.  If that backwards scan wraps around the beginning of
		 * the log, then we look for occurrences of last_half_cycle - 1
		 * at the end of the log.  The cases we're looking for look
		 * like
		 *        v binary search stopped here
		 *        x + 1 ... | x | x + 1 | x ... | x
		 *                   ^ but we want to locate this spot
		 * or
		 *        <---------> less than scan distance
		 *        x + 1 ... | x ... | x - 1 | x
		 *                           ^ we want to locate this spot
		 */
		stop_on_cycle = last_half_cycle;
		if ((error = xlog_find_cycle_start(log, bp, first_blk,
						&head_blk, last_half_cycle)))
			goto bp_err;
	}

	/*
	 * Now validate the answer.  Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number.  The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log.  The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		/*
		 * We are guaranteed that the entire check can be performed
		 * in one buffer.
		 */
		start_blk = head_blk - num_scan_bblks;
		if ((error = xlog_find_verify_cycle(log,
						start_blk, num_scan_bblks,
						stop_on_cycle, &new_blk)))
			goto bp_err;
		if (new_blk != -1)
			head_blk = new_blk;
	} else {		/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.
		 * First we scan the physical end of the log.  In this part
		 * of the log, we are looking for blocks with cycle number
		 * last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as
		 * we've found a hole that didn't get written in going around
		 * the end of the physical log.  The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of
		 * the log looking for occurrences of last_half_cycle.  If we
		 * find one, then our current estimate for the location of the
		 * first occurrence of last_half_cycle is wrong and we move
		 * back to the hole we've found.  This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                       ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k
		 * logs is
		 *        x + 1 ... | x ... | x+1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the
		 * x + 1 blocks.  We need to skip past those since that is
		 * certainly not the head of the log.  By searching for
		 * last_half_cycle-1 we accomplish that.
		 */
		ASSERT(head_blk <= INT_MAX &&
			(xfs_daddr_t) num_scan_bblks >= head_blk);
		start_blk = log_bbnum - (num_scan_bblks - head_blk);
		if ((error = xlog_find_verify_cycle(log, start_blk,
					num_scan_bblks - (int)head_blk,
					(stop_on_cycle - 1), &new_blk)))
			goto bp_err;
		if (new_blk != -1) {
			head_blk = new_blk;
			goto validate_head;
		}

		/*
		 * Scan beginning of log now.  The last part of the physical
		 * log is good.  This scan needs to verify that it doesn't find
		 * the last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_cycle(log,
					start_blk, (int)head_blk,
					stop_on_cycle, &new_blk)))
			goto bp_err;
		if (new_blk != -1)
			head_blk = new_blk;
	}

validate_head:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = XLOG_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		if ((error = xlog_find_verify_log_record(log, start_blk,
							&head_blk, 0)) == -1) {
			error = XFS_ERROR(EIO);
			goto bp_err;
		} else if (error)
			goto bp_err;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_log_record(log, start_blk,
							&head_blk, 0)) == -1) {
			/* We hit the beginning of the log during our search */
			start_blk = log_bbnum - (num_scan_bblks - head_blk);
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX &&
				(xfs_daddr_t) log_bbnum-start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			if ((error = xlog_find_verify_log_record(log,
							start_blk, &new_blk,
							(int)head_blk)) == -1) {
				error = XFS_ERROR(EIO);
				goto bp_err;
			} else if (error)
				goto bp_err;
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto bp_err;
	}

	libxfs_buf_relse(bp);
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number.  Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1.  In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

 bp_err:
	libxfs_buf_relse(bp);

	if (error)
		xfs_warn(log->l_mp, "failed to find log head");
	return error;
}

/*
 * Find the sync block number or the tail of the log.
 *
 * This will be the block number of the last record to have its
 * associated buffers synced to disk.  Every log record header has
 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 * to get a sync block number.  The only concern is to figure out which
 * log record header to believe.
 *
 * The following algorithm uses the log record header with the largest
 * lsn.  The entire log record does not need to be valid.  We only care
 * that the header is valid.
 *
 * We could speed up search by using current head_blk buffer, but it is not
 * available.
 */
int
xlog_find_tail(
	struct xlog	*log,
	xfs_daddr_t	*head_blk,
	xfs_daddr_t	*tail_blk)
{
	xlog_rec_header_t	*rhead;
	xlog_op_header_t	*op_head;
	char		*offset = NULL;
	struct xfs_buf	*bp;
	int		error, i, found;
	xfs_daddr_t	umount_data_blk;
	xfs_daddr_t	after_umount_blk;
	xfs_lsn_t	tail_lsn;
	int		hblks;

	found = 0;

	/*
	 * Find previous log record
	 */
	if ((error = xlog_find_head(log, head_blk)))
		return error;

	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;
	if (*head_blk == 0) {				/* special case */
		error = xlog_bread(log, 0, 1, bp, &offset);
		if (error)
			goto done;

		if (xlog_get_cycle(offset) == 0) {
			*tail_blk = 0;
			/* leave all other log inited values alone */
			goto done;
		}
	}

	/*
	 * Search backwards looking for log record header block
	 */
	ASSERT(*head_blk < INT_MAX);
	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
		error = xlog_bread(log, i, 1, bp, &offset);
		if (error)
			goto done;

		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
			found = 1;
			break;
		}
	}
	/*
	 * If we haven't found the log record header block, start looking
	 * again from the end of the physical log.  XXXmiken: There should be
	 * a check here to make sure we didn't search more than N blocks in
	 * the previous code.
	 */
	if (!found) {
		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
			error = xlog_bread(log, i, 1, bp, &offset);
			if (error)
				goto done;

			if (*(__be32 *)offset ==
			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
				found = 2;
				break;
			}
		}
	}
	if (!found) {
		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
		libxfs_buf_relse(bp);
		ASSERT(0);
		return XFS_ERROR(EIO);
	}

	/* find blk_no of tail of log */
	rhead = (xlog_rec_header_t *)offset;
	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));

	/*
	 * Reset log values according to the state of the log when we
	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
	 * one because the next write starts a new cycle rather than
	 * continuing the cycle of the last good log record.  At this
	 * point we have guaranteed that all partial log records have been
	 * accounted for.  Therefore, we know that the last good log record
	 * written was complete and ended exactly on the end boundary
	 * of the physical log.
	 */
	log->l_prev_block = i;
	log->l_curr_block = (int)*head_blk;
	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
	if (found == 2)
		log->l_curr_cycle++;
	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
					BBTOB(log->l_curr_block));
	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
					BBTOB(log->l_curr_block));

	/*
	 * Look for unmount record.  If we find it, then we know there
	 * was a clean unmount.  Since 'i' could be the last block in
	 * the physical log, we convert to a log block before comparing
	 * to the head_blk.
	 *
	 * Save the current tail lsn to use to pass to
	 * xlog_clear_stale_blocks() below.  We won't want to clear the
	 * unmount record if there is one, so we pass the lsn of the
	 * unmount record rather than the block after it.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		int	h_size = be32_to_cpu(rhead->h_size);
		int	h_version = be32_to_cpu(rhead->h_version);

		if ((h_version & XLOG_VERSION_2) &&
		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
			if (h_size % XLOG_HEADER_CYCLE_SIZE)
				hblks++;
		} else {
			hblks = 1;
		}
	} else {
		hblks = 1;
	}
	after_umount_blk = (i + hblks + (int)
		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
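	/*
	 * after_umount_blk wraps modulo the log size: e.g. a record header
	 * in the very last block (hblks == 1) with a one-block body yields
	 * after_umount_blk == 1, back at the start of the log.
	 */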
	tail_lsn = atomic64_read(&log->l_tail_lsn);
	if (*head_blk == after_umount_blk &&
	    be32_to_cpu(rhead->h_num_logops) == 1) {
		umount_data_blk = (i + hblks) % log->l_logBBsize;
		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
		if (error)
			goto done;

		op_head = (xlog_op_header_t *)offset;
		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
			/*
			 * Set tail and last sync so that newly written
			 * log records will point recovery to after the
			 * current unmount record.
			 */
			xlog_assign_atomic_lsn(&log->l_tail_lsn,
					log->l_curr_cycle, after_umount_blk);
			xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
					log->l_curr_cycle, after_umount_blk);
			*tail_blk = after_umount_blk;

			/*
			 * Note that the unmount was clean.  If the unmount
			 * was not clean, we need to know this to rebuild the
			 * superblock counters from the perag headers if we
			 * have a filesystem using non-persistent counters.
			 */
			log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
		}
	}

	/*
	 * Make sure that there are no blocks in front of the head
	 * with the same cycle number as the head.  This can happen
	 * because we allow multiple outstanding log writes concurrently,
	 * and the later writes might make it out before earlier ones.
	 *
	 * We use the lsn from before modifying it so that we'll never
	 * overwrite the unmount record after a clean unmount.
	 *
	 * Do this only if we are going to recover the filesystem
	 *
	 * NOTE: This used to say "if (!readonly)"
	 * However on Linux, we can & do recover a read-only filesystem.
	 * We only skip recovery if NORECOVERY is specified on mount,
	 * in which case we would not be here.
	 *
	 * But... if the -device- itself is readonly, just skip this.
	 * We can't recover this device anyway, so it won't matter.
	 */
	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
		error = xlog_clear_stale_blocks(log, tail_lsn);

done:
	libxfs_buf_relse(bp);

	if (error)
		xfs_warn(log->l_mp, "failed to locate log tail");
	return error;
}

/*
 * Is the log zeroed at all?
 *
 * The last binary search should be changed to perform an X block read
 * once X becomes small enough.  You can then search linearly through
 * the X blocks.  This will cut down on the number of reads we need to do.
 *
 * If the log is partially zeroed, this routine will pass back the blkno
 * of the first block with cycle number 0.  It won't have a complete LR
 * preceding it.
 *
 * Return:
 *	0  => the log is completely written to
 *	-1 => use *blk_no as the first block of the log
 *	>0 => error has occurred
 */
int
xlog_find_zeroed(
	struct xlog	*log,
	xfs_daddr_t	*blk_no)
{
	struct xfs_buf	*bp;
	char		*offset;
	uint		first_cycle, last_cycle;
	xfs_daddr_t	new_blk, last_blk, start_blk;
	xfs_daddr_t	num_scan_bblks;
	int		error, log_bbnum = log->l_logBBsize;

	*blk_no = 0;

	/* check totally zeroed log */
	bp = xlog_get_bp(log, 1);
	if (!bp)
		return ENOMEM;
	error = xlog_bread(log, 0, 1, bp, &offset);
	if (error)
		goto bp_err;

	first_cycle = xlog_get_cycle(offset);
	if (first_cycle == 0) {		/* completely zeroed log */
		*blk_no = 0;
		libxfs_buf_relse(bp);
		return -1;
	}

	/* check partially zeroed log */
	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
	if (error)
		goto bp_err;

	last_cycle = xlog_get_cycle(offset);
	if (last_cycle != 0) {		/* log completely written to */
		libxfs_buf_relse(bp);
		return 0;
	} else if (first_cycle != 1) {
		/*
		 * If the cycle of the last block is zero, the cycle of
		 * the first block must be 1.  If it's not, maybe we're
		 * not looking at a log... Bail out.
		 */
		xfs_warn(log->l_mp,
			"Log inconsistent or not a log (last==0, first!=1)");
		error = XFS_ERROR(EINVAL);
		goto bp_err;
	}

	/* we have a partially zeroed log */
	last_blk = log_bbnum-1;
	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
		goto bp_err;

	/*
	 * Validate the answer.  Because there is no way to guarantee that
	 * the entire log is made up of log records which are the same size,
	 * we scan over the defined maximum blocks.  At this point, the maximum
	 * is not chosen to mean anything special.  XXXmiken
	 */
	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
	ASSERT(num_scan_bblks <= INT_MAX);

	if (last_blk < num_scan_bblks)
		num_scan_bblks = last_blk;
	start_blk = last_blk - num_scan_bblks;

	/*
	 * We search for any instances of cycle number 0 that occur before
	 * our current estimate of the head.  What we're trying to detect is
	 *        1 ... | 0 | 1 | 0...
	 *                       ^ binary search ends here
	 */
	if ((error = xlog_find_verify_cycle(log, start_blk,
					 (int)num_scan_bblks, 0, &new_blk)))
		goto bp_err;
	if (new_blk != -1)
		last_blk = new_blk;

	/*
	 * Potentially backup over partial log record write.  We don't need
	 * to search the end of the log because we know it is zero.
	 */
	if ((error = xlog_find_verify_log_record(log, start_blk,
				&last_blk, 0)) == -1) {
		error = XFS_ERROR(EIO);
		goto bp_err;
	} else if (error)
		goto bp_err;

	*blk_no = last_blk;
bp_err:
	libxfs_buf_relse(bp);
	if (error)
		return error;
	return -1;
}

STATIC struct xlog_recover *
xlog_recover_find_tid(
	struct hlist_head	*head,
	xlog_tid_t		tid)
{
	struct xlog_recover	*trans;
	struct hlist_node	*n;

	hlist_for_each_entry(trans, n, head, r_list) {
		if (trans->r_log_tid == tid)
			return trans;
	}
	return NULL;
}

STATIC void
xlog_recover_new_tid(
	struct hlist_head	*head,
	xlog_tid_t		tid,
	xfs_lsn_t		lsn)
{
	struct xlog_recover	*trans;

	trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
	trans->r_log_tid = tid;
	trans->r_lsn = lsn;
	INIT_LIST_HEAD(&trans->r_itemq);

	INIT_HLIST_NODE(&trans->r_list);
	hlist_add_head(&trans->r_list, head);
}

STATIC void
xlog_recover_add_item(
	struct list_head	*head)
{
	struct xlog_recover_item *item;

	item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
	INIT_LIST_HEAD(&item->ri_list);
	list_add_tail(&item->ri_list, head);
}

STATIC int
xlog_recover_add_to_cont_trans(
	struct xlog		*log,
	struct xlog_recover	*trans,
	char			*dp,
	int			len)
{
	struct xlog_recover_item *item;
	char			*ptr, *old_ptr;
	int			old_len;

	if (list_empty(&trans->r_itemq)) {
		/* finish copying rest of trans header */
		xlog_recover_add_item(&trans->r_itemq);
		ptr = (char *) &trans->r_theader +
				sizeof(xfs_trans_header_t) - len;
		memcpy(ptr, dp, len); /* d, s, l */
		return 0;
	}
	/* take the tail entry */
	item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
			  ri_list);

	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
	old_len = item->ri_buf[item->ri_cnt-1].i_len;

	ptr = krealloc(old_ptr, len+old_len, 0);
	memcpy(&ptr[old_len], dp, len); /* d, s, l */
	item->ri_buf[item->ri_cnt-1].i_len += len;
	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
	return 0;
}

/*
 * The next region to add is the start of a new region.  It could be
 * a whole region or it could be the first part of a new region.  Because
 * of this, the assumption here is that the type and size fields of all
 * format structures fit into the first 32 bits of the structure.
 *
 * This works because all regions must be 32 bit aligned.  Therefore, we
 * either have both fields or we have neither field.  In the case we have
 * neither field, the data part of the region is zero length.  We only have
 * a log_op_header and can throw away the header since a new one will appear
 * later.  If we have at least 4 bytes, then we can determine how many regions
 * will appear in the current log item.
 */
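/*
 * For illustration: an inode log item that logs its format header,
 * inode core and data fork carries ilf_size == 3, so those first 4
 * bytes are enough to size ri_buf[] before the remaining regions of
 * the item arrive.
 */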
STATIC int
xlog_recover_add_to_trans(
	struct xlog		*log,
	struct xlog_recover	*trans,
	char			*dp,
	int			len)
{
	struct xfs_inode_log_format	*in_f;	/* any will do */
	struct xlog_recover_item *item;
	char			*ptr;

	if (!len)
		return 0;
	if (list_empty(&trans->r_itemq)) {
		/* we need to catch log corruptions here */
		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
			xfs_warn(log->l_mp, "%s: bad header magic number",
				__func__);
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
		if (len == sizeof(xfs_trans_header_t))
			xlog_recover_add_item(&trans->r_itemq);
		memcpy(&trans->r_theader, dp, len); /* d, s, l */
		return 0;
	}

	ptr = kmem_alloc(len, 0);
	memcpy(ptr, dp, len);
	in_f = (struct xfs_inode_log_format *)ptr;

	/* take the tail entry */
	item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
			  ri_list);
	if (item->ri_total != 0 &&
	    item->ri_total == item->ri_cnt) {
		/* tail item is in use, get a new one */
		xlog_recover_add_item(&trans->r_itemq);
		item = list_entry(trans->r_itemq.prev,
					struct xlog_recover_item, ri_list);
	}

	if (item->ri_total == 0) {		/* first region to be added */
		if (in_f->ilf_size == 0 ||
		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
			xfs_warn(log->l_mp,
		"bad number of regions (%d) in inode log format",
				in_f->ilf_size);
			ASSERT(0);
			kmem_free(ptr);
			return XFS_ERROR(EIO);
		}

		item->ri_total = in_f->ilf_size;
		item->ri_buf =
			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
				    0);
	}
	ASSERT(item->ri_total > item->ri_cnt);
	/* Description region is ri_buf[0] */
	item->ri_buf[item->ri_cnt].i_addr = ptr;
	item->ri_buf[item->ri_cnt].i_len = len;
	item->ri_cnt++;
	trace_xfs_log_recover_item_add(log, trans, item, 0);
	return 0;
}

/*
 * Free up any resources allocated by the transaction
 *
 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
 */
STATIC void
xlog_recover_free_trans(
	struct xlog_recover	*trans)
{
	struct xlog_recover_item *item, *n;
	int			i;

	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
		/* Free the regions in the item. */
		list_del(&item->ri_list);
		for (i = 0; i < item->ri_cnt; i++)
			kmem_free(item->ri_buf[i].i_addr);
		/* Free the item itself */
		kmem_free(item->ri_buf);
		kmem_free(item);
	}
	/* Free the transaction recover structure */
	kmem_free(trans);
}

/*
 * Perform the transaction.
 *
 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
 * EFIs and EFDs get queued up by adding entries into the AIL for them.
 */
STATIC int
xlog_recover_commit_trans(
	struct xlog		*log,
	struct xlog_recover	*trans,
	int			pass)
{
	int			error = 0;

	hlist_del(&trans->r_list);
	if ((error = xlog_recover_do_trans(log, trans, pass)))
		return error;

	xlog_recover_free_trans(trans);
	return 0;
}

STATIC int
xlog_recover_unmount_trans(
	struct xlog		*log,
	struct xlog_recover	*trans)
{
	/* Do nothing now */
	xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
	return 0;
}

/*
 * There are two valid states of the r_state field.  0 indicates that the
 * transaction structure is in a normal state.  We have either seen the
 * start of the transaction or the last operation we added was not a partial
 * operation.  If the last operation we added to the transaction was a
 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
 *
 * NOTE: skip LRs with 0 data length.
 */
STATIC int
xlog_recover_process_data(
	struct xlog		*log,
	struct hlist_head	rhash[],
	struct xlog_rec_header	*rhead,
	char			*dp,
	int			pass)
{
	char			*lp;
	int			num_logops;
	xlog_op_header_t	*ohead;
	struct xlog_recover	*trans;
	xlog_tid_t		tid;
	int			error;
	unsigned long		hash;
	uint			flags;

	lp = dp + be32_to_cpu(rhead->h_len);
	num_logops = be32_to_cpu(rhead->h_num_logops);

	/* check the log format matches our own - else we can't recover */
	if (xlog_header_check_recover(log->l_mp, rhead))
		return (XFS_ERROR(EIO));

	while ((dp < lp) && num_logops) {
		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
		ohead = (xlog_op_header_t *)dp;
		dp += sizeof(xlog_op_header_t);
		if (ohead->oh_clientid != XFS_TRANSACTION &&
		    ohead->oh_clientid != XFS_LOG) {
			xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
					__func__, ohead->oh_clientid);
			ASSERT(0);
			return (XFS_ERROR(EIO));
		}
		tid = be32_to_cpu(ohead->oh_tid);
		hash = XLOG_RHASH(tid);
		trans = xlog_recover_find_tid(&rhash[hash], tid);
		if (trans == NULL) {		   /* not found; add new tid */
			if (ohead->oh_flags & XLOG_START_TRANS)
				xlog_recover_new_tid(&rhash[hash], tid,
					be64_to_cpu(rhead->h_lsn));
		} else {
			if (dp + be32_to_cpu(ohead->oh_len) > lp) {
				xfs_warn(log->l_mp, "%s: bad length 0x%x",
					__func__, be32_to_cpu(ohead->oh_len));
				return (XFS_ERROR(EIO));
			}
			flags = ohead->oh_flags & ~XLOG_END_TRANS;
			if (flags & XLOG_WAS_CONT_TRANS)
				flags &= ~XLOG_CONTINUE_TRANS;
			switch (flags) {
			case XLOG_COMMIT_TRANS:
				error = xlog_recover_commit_trans(log,
							trans, pass);
				break;
			case XLOG_UNMOUNT_TRANS:
				error = xlog_recover_unmount_trans(log, trans);
				break;
			case XLOG_WAS_CONT_TRANS:
				error = xlog_recover_add_to_cont_trans(log,
						trans, dp,
						be32_to_cpu(ohead->oh_len));
				break;
			case XLOG_START_TRANS:
				xfs_warn(log->l_mp, "%s: bad transaction",
					__func__);
				ASSERT(0);
				error = XFS_ERROR(EIO);
				break;
			case 0:
			case XLOG_CONTINUE_TRANS:
				error = xlog_recover_add_to_trans(log, trans,
						dp, be32_to_cpu(ohead->oh_len));
				break;
			default:
				xfs_warn(log->l_mp, "%s: bad flag 0x%x",
					__func__, flags);
				ASSERT(0);
				error = XFS_ERROR(EIO);
				break;
			}
			if (error)
				return error;
		}
		dp += be32_to_cpu(ohead->oh_len);
		num_logops--;
	}
	return 0;
}

/*
 * Unpack the log buffer data and crc check it.  If the check fails, issue a
 * warning if and only if the CRC in the header is non-zero.  This makes the
 * check an advisory warning, and the zero CRC check will prevent failure
 * warnings from being emitted when upgrading the kernel from one that does not
 * add CRCs by default.
 *
 * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
 * corruption failure.
 *
 * XXX: we do not calculate the CRC here yet.  It's not clear what we should do
 * with CRC errors here in userspace, so we'll address that problem later on.
 */
#define xlog_cksum(l,r,dp,len)	((r)->h_crc)
STATIC int
xlog_unpack_data_crc(
	struct xlog_rec_header	*rhead,
	char			*dp,
	struct xlog		*log)
{
	__le32			crc;

	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
	if (crc != rhead->h_crc) {
		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
			xfs_alert(log->l_mp,
		"log record CRC mismatch: found 0x%x, expected 0x%x.",
				le32_to_cpu(rhead->h_crc),
				le32_to_cpu(crc));
			xfs_hex_dump(dp, 32);
		}

		/*
		 * If we've detected a log record corruption, then we can't
		 * recover past this point.  Abort recovery if we are enforcing
		 * CRC protection by punting an error back up the stack.
		 */
		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
			return EFSCORRUPTED;
	}

	return 0;
}

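/*
 * On disk, the first 4 bytes of every basic block in a record's body
 * are overwritten with the record's cycle number; the displaced words
 * are saved in h_cycle_data[] (and in extended headers once a record
 * exceeds XLOG_HEADER_CYCLE_SIZE).  xlog_unpack_data() below restores
 * them before the payload is parsed.
 */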
STATIC int
xlog_unpack_data(
	struct xlog_rec_header	*rhead,
	char			*dp,
	struct xlog		*log)
{
	int			i, j, k;
	int			error;

	error = xlog_unpack_data_crc(rhead, dp, log);
	if (error)
		return error;

	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
		dp += BBSIZE;
	}

	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
			dp += BBSIZE;
		}
	}

	return 0;
}

STATIC int
xlog_valid_rec_header(
	struct xlog		*log,
	struct xlog_rec_header	*rhead,
	xfs_daddr_t		blkno)
{
	int			hlen;

	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (unlikely(
	    (!rhead->h_version ||
	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
			__func__, be32_to_cpu(rhead->h_version));
		return XFS_ERROR(EIO);
	}

	/* LR body must have data or it wouldn't have been written */
	hlen = be32_to_cpu(rhead->h_len);
	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
				XFS_ERRLEVEL_LOW, log->l_mp);
		return XFS_ERROR(EFSCORRUPTED);
	}
	return 0;
}

/*
 * Read the log from tail to head and process the log records found.
 * Handle the two cases where the tail and head are in the same cycle
 * and where the active portion of the log wraps around the end of
 * the physical log separately.  The pass parameter is passed through
 * to the routines called to process the data and is not looked at
 * here.
 */
int
xlog_do_recovery_pass(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk,
	int		pass)
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no;
	char			*offset;
	struct xfs_buf		*hbp, *dbp;
	int			error = 0, h_size;
	int			bblks, split_bblks;
	int			hblks, split_hblks, wrapped_hblks;
	struct hlist_head	rhash[XLOG_RHASH_SIZE];

	ASSERT(head_blk != tail_blk);

	/*
	 * Read the header of the tail block and get the iclog buffer size from
	 * h_size.  Use this to tell how many sectors make up the log header.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		/*
		 * When using variable length iclogs, read first sector of
		 * iclog header and extract the header size from it.  Get a
		 * new hbp that is the correct size.
		 */
		hbp = xlog_get_bp(log, 1);
		if (!hbp)
			return ENOMEM;

		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
		if (error)
			goto bread_err1;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, tail_blk);
		if (error)
			goto bread_err1;
		h_size = be32_to_cpu(rhead->h_size);
		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
			if (h_size % XLOG_HEADER_CYCLE_SIZE)
				hblks++;
			libxfs_buf_relse(hbp);
			hbp = xlog_get_bp(log, hblks);
		} else {
			hblks = 1;
		}
	} else {
		ASSERT(log->l_sectBBsize == 1);
		hblks = 1;
		hbp = xlog_get_bp(log, 1);
		h_size = XLOG_BIG_RECORD_BSIZE;
	}
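	/*
	 * e.g. a v2 log with 128k iclogs reports h_size == 131072, giving
	 * hblks == 131072 / XLOG_HEADER_CYCLE_SIZE == 4 header blocks per
	 * record.
	 */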

	if (!hbp)
		return ENOMEM;
	dbp = xlog_get_bp(log, BTOBB(h_size));
	if (!dbp) {
		libxfs_buf_relse(hbp);
		return ENOMEM;
	}

	memset(rhash, 0, sizeof(rhash));
	if (tail_blk <= head_blk) {
		for (blk_no = tail_blk; blk_no < head_blk; ) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			/* blocks in data section */
			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log,
						rhash, rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	} else {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery as above.
		 */
		blk_no = tail_blk;
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = hbp->b_addr;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread_offset(log, 0,
						wrapped_hblks, hbp,
						offset + BBTOB(split_hblks));
				if (error)
					goto bread_err2;
			}
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
						split_hblks ? blk_no : 0);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				error = xlog_bread(log, blk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = dbp->b_addr;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							   split_bblks, dbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				error = xlog_bread_offset(log, 0,
						bblks - split_bblks, dbp,
						offset + BBTOB(split_bblks));
				if (error)
					goto bread_err2;
			}

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks;
		}

		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;

		/* read first part of physical log */
		while (blk_no < head_blk) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			error = xlog_unpack_data(rhead, offset, log);
			if (error)
				goto bread_err2;

			error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass);
			if (error)
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	}

 bread_err2:
	libxfs_buf_relse(dbp);
 bread_err1:
	libxfs_buf_relse(hbp);
	return error;
}