libxlog/xfs_log_recover.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "libxfs.h"
   7 #include "libxlog.h"
   8
   9 #define xfs_readonly_buftarg(buftarg)                   (0)
  10
  11 /* avoid set-but-unused var warning. gcc is not very bright. */
  12 #define xlog_clear_stale_blocks(log, taillsn)           ({ \
  13         (taillsn) = (taillsn); \
  14         (0); \
  15 })
  16
  17 #define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
  18
  19 /*
  20  * Verify the given count of basic blocks is valid number of blocks
  21  * to specify for an operation involving the given XFS log buffer.
  22  * Returns nonzero if the count is valid, 0 otherwise.
  23  */
  24
  25 static inline int
  26 xlog_buf_bbcount_valid(
  27         struct xlog     *log,
  28         int             bbcount)
  29 {
  30         return bbcount > 0 && bbcount <= log->l_logBBsize;
  31 }
  32
  33 /*
  34  * Allocate a buffer to hold log data.  The buffer needs to be able
  35  * to map to a range of nbblks basic blocks at any valid (basic
  36  * block) offset within the log.
  37  */
  38 xfs_buf_t *
  39 xlog_get_bp(
  40         struct xlog     *log,
  41         int             nbblks)
  42 {
  43         if (!xlog_buf_bbcount_valid(log, nbblks)) {
  44                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
  45                         nbblks);
  46                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
  47                 return NULL;
  48         }
  49
  50         /*
  51          * We do log I/O in units of log sectors (a power-of-2
  52          * multiple of the basic block size), so we round up the
  53          * requested size to accommodate the basic blocks required
  54          * for complete log sectors.
  55          *
  56          * In addition, the buffer may be used for a non-sector-
  57          * aligned block offset, in which case an I/O of the
  58          * requested size could extend beyond the end of the
  59          * buffer.  If the requested size is only 1 basic block it
  60          * will never straddle a sector boundary, so this won't be
  61          * an issue.  Nor will this be a problem if the log I/O is
  62          * done in basic blocks (sector size 1).  But otherwise we
  63          * extend the buffer by one extra log sector to ensure
  64          * there's space to accommodate this possibility.
  65          */
  66         if (nbblks > 1 && log->l_sectBBsize > 1)
  67                 nbblks += log->l_sectBBsize;
  68         nbblks = round_up(nbblks, log->l_sectBBsize);
  69
  70         return libxfs_getbufr(log->l_dev, (xfs_daddr_t)-1, nbblks);
  71 }
  72
  73 void
  74 xlog_put_bp(
  75         xfs_buf_t       *bp)
  76 {
  77         libxfs_putbufr(bp);
  78 }
  79
  80 /*
  81  * Return the address of the start of the given block number's data
  82  * in a log buffer.  The buffer covers a log sector-aligned region.
  83  */
  84 STATIC char *
  85 xlog_align(
  86         struct xlog     *log,
  87         xfs_daddr_t     blk_no,
  88         int             nbblks,
  89         struct xfs_buf  *bp)
  90 {
  91         xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
  92
  93         ASSERT(offset + nbblks <= bp->b_length);
  94         return bp->b_addr + BBTOB(offset);
  95 }
  96
  97
  98 /*
  99  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 100  */
 101 int
 102 xlog_bread_noalign(
 103         struct xlog     *log,
 104         xfs_daddr_t     blk_no,
 105         int             nbblks,
 106         struct xfs_buf  *bp)
 107 {
 108         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 109                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 110                         nbblks);
 111                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 112                 return EFSCORRUPTED;
 113         }
 114
 115         blk_no = round_down(blk_no, log->l_sectBBsize);
 116         nbblks = round_up(nbblks, log->l_sectBBsize);
 117
 118         ASSERT(nbblks > 0);
 119         ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
 120
 121         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 122         bp->b_bcount = BBTOB(nbblks);
 123         bp->b_error = 0;
 124
 125         return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
 126 }
 127
 128 int
 129 xlog_bread(
 130         struct xlog     *log,
 131         xfs_daddr_t     blk_no,
 132         int             nbblks,
 133         struct xfs_buf  *bp,
 134         char            **offset)
 135 {
 136         int             error;
 137
 138         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 139         if (error)
 140                 return error;
 141
 142         *offset = xlog_align(log, blk_no, nbblks, bp);
 143         return 0;
 144 }
 145
 146 /*
 147  * Read at an offset into the buffer. Returns with the buffer in it's original
 148  * state regardless of the result of the read.
 149  */
 150 STATIC int
 151 xlog_bread_offset(
 152         struct xlog     *log,
 153         xfs_daddr_t     blk_no,         /* block to read from */
 154         int             nbblks,         /* blocks to read */
 155         struct xfs_buf  *bp,
 156         char            *offset)
 157 {
 158         char            *orig_offset = bp->b_addr;
 159         int             orig_len = bp->b_bcount;
 160         int             error, error2;
 161
 162         error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 163         if (error)
 164                 return error;
 165
 166         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 167
 168         /* must reset buffer pointer even on error */
 169         error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 170         if (error)
 171                 return error;
 172         return error2;
 173 }
 174
 175 /*
 176  * This routine finds (to an approximation) the first block in the physical
 177  * log which contains the given cycle.  It uses a binary search algorithm.
 178  * Note that the algorithm can not be perfect because the disk will not
 179  * necessarily be perfect.
 180  */
 181 int
 182 xlog_find_cycle_start(
 183         struct xlog     *log,
 184         struct xfs_buf  *bp,
 185         xfs_daddr_t     first_blk,
 186         xfs_daddr_t     *last_blk,
 187         uint            cycle)
 188 {
 189         char            *offset;
 190         xfs_daddr_t     mid_blk;
 191         xfs_daddr_t     end_blk;
 192         uint            mid_cycle;
 193         int             error;
 194
 195         end_blk = *last_blk;
 196         mid_blk = BLK_AVG(first_blk, end_blk);
 197         while (mid_blk != first_blk && mid_blk != end_blk) {
 198                 error = xlog_bread(log, mid_blk, 1, bp, &offset);
 199                 if (error)
 200                         return error;
 201                 mid_cycle = xlog_get_cycle(offset);
 202                 if (mid_cycle == cycle)
 203                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 204                 else
 205                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 206                 mid_blk = BLK_AVG(first_blk, end_blk);
 207         }
 208         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 209                (mid_blk == end_blk && mid_blk-1 == first_blk));
 210
 211         *last_blk = end_blk;
 212
 213         return 0;
 214 }
 215
 216 /*
 217  * Check that a range of blocks does not contain stop_on_cycle_no.
 218  * Fill in *new_blk with the block offset where such a block is
 219  * found, or with -1 (an invalid block number) if there is no such
 220  * block in the range.  The scan needs to occur from front to back
 221  * and the pointer into the region must be updated since a later
 222  * routine will need to perform another test.
 223  */
 224 STATIC int
 225 xlog_find_verify_cycle(
 226         struct xlog     *log,
 227         xfs_daddr_t     start_blk,
 228         int             nbblks,
 229         uint            stop_on_cycle_no,
 230         xfs_daddr_t     *new_blk)
 231 {
 232         xfs_daddr_t     i, j;
 233         uint            cycle;
 234         xfs_buf_t       *bp;
 235         int             bufblks;
 236         char            *buf = NULL;
 237         int             error = 0;
 238
 239         /*
 240          * Greedily allocate a buffer big enough to handle the full
 241          * range of basic blocks we'll be examining.  If that fails,
 242          * try a smaller size.  We need to be able to read at least
 243          * a log sector, or we're out of luck.
 244          */
 245         bufblks = 1 << ffs(nbblks);
 246         while (bufblks > log->l_logBBsize)
 247                 bufblks >>= 1;
 248         while (!(bp = xlog_get_bp(log, bufblks))) {
 249                 bufblks >>= 1;
 250                 if (bufblks < log->l_sectBBsize)
 251                         return ENOMEM;
 252         }
 253
 254         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 255                 int     bcount;
 256
 257                 bcount = min(bufblks, (start_blk + nbblks - i));
 258
 259                 error = xlog_bread(log, i, bcount, bp, &buf);
 260                 if (error)
 261                         goto out;
 262
 263                 for (j = 0; j < bcount; j++) {
 264                         cycle = xlog_get_cycle(buf);
 265                         if (cycle == stop_on_cycle_no) {
 266                                 *new_blk = i+j;
 267                                 goto out;
 268                         }
 269
 270                         buf += BBSIZE;
 271                 }
 272         }
 273
 274         *new_blk = -1;
 275
 276 out:
 277         xlog_put_bp(bp);
 278         return error;
 279 }
 280
 281 /*
 282  * Potentially backup over partial log record write.
 283  *
 284  * In the typical case, last_blk is the number of the block directly after
 285  * a good log record.  Therefore, we subtract one to get the block number
 286  * of the last block in the given buffer.  extra_bblks contains the number
 287  * of blocks we would have read on a previous read.  This happens when the
 288  * last log record is split over the end of the physical log.
 289  *
 290  * extra_bblks is the number of blocks potentially verified on a previous
 291  * call to this routine.
 292  */
 293 STATIC int
 294 xlog_find_verify_log_record(
 295         struct xlog             *log,
 296         xfs_daddr_t             start_blk,
 297         xfs_daddr_t             *last_blk,
 298         int                     extra_bblks)
 299 {
 300         xfs_daddr_t             i;
 301         xfs_buf_t               *bp;
 302         char                    *offset = NULL;
 303         xlog_rec_header_t       *head = NULL;
 304         int                     error = 0;
 305         int                     smallmem = 0;
 306         int                     num_blks = *last_blk - start_blk;
 307         int                     xhdrs;
 308
 309         ASSERT(start_blk != 0 || *last_blk != start_blk);
 310
 311         if (!(bp = xlog_get_bp(log, num_blks))) {
 312                 if (!(bp = xlog_get_bp(log, 1)))
 313                         return ENOMEM;
 314                 smallmem = 1;
 315         } else {
 316                 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 317                 if (error)
 318                         goto out;
 319                 offset += ((num_blks - 1) << BBSHIFT);
 320         }
 321
 322         for (i = (*last_blk) - 1; i >= 0; i--) {
 323                 if (i < start_blk) {
 324                         /* valid log record not found */
 325                         xfs_warn(log->l_mp,
 326                 "Log inconsistent (didn't find previous header)");
 327                         ASSERT(0);
 328                         error = XFS_ERROR(EIO);
 329                         goto out;
 330                 }
 331
 332                 if (smallmem) {
 333                         error = xlog_bread(log, i, 1, bp, &offset);
 334                         if (error)
 335                                 goto out;
 336                 }
 337
 338                 head = (xlog_rec_header_t *)offset;
 339
 340                 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 341                         break;
 342
 343                 if (!smallmem)
 344                         offset -= BBSIZE;
 345         }
 346
 347         /*
 348          * We hit the beginning of the physical log & still no header.  Return
 349          * to caller.  If caller can handle a return of -1, then this routine
 350          * will be called again for the end of the physical log.
 351          */
 352         if (i == -1) {
 353                 error = -1;
 354                 goto out;
 355         }
 356
 357         /*
 358          * We have the final block of the good log (the first block
 359          * of the log record _before_ the head. So we check the uuid.
 360          */
 361         if ((error = xlog_header_check_mount(log->l_mp, head)))
 362                 goto out;
 363
 364         /*
 365          * We may have found a log record header before we expected one.
 366          * last_blk will be the 1st block # with a given cycle #.  We may end
 367          * up reading an entire log record.  In this case, we don't want to
 368          * reset last_blk.  Only when last_blk points in the middle of a log
 369          * record do we update last_blk.
 370          */
 371         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 372                 uint    h_size = be32_to_cpu(head->h_size);
 373
 374                 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 375                 if (h_size % XLOG_HEADER_CYCLE_SIZE)
 376                         xhdrs++;
 377         } else {
 378                 xhdrs = 1;
 379         }
 380
 381         if (*last_blk - i + extra_bblks !=
 382             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 383                 *last_blk = i;
 384
 385 out:
 386         xlog_put_bp(bp);
 387         return error;
 388 }
 389
 390 /*
 391  * Head is defined to be the point of the log where the next log write
 392  * write could go.  This means that incomplete LR writes at the end are
 393  * eliminated when calculating the head.  We aren't guaranteed that previous
 394  * LR have complete transactions.  We only know that a cycle number of
 395  * current cycle number -1 won't be present in the log if we start writing
 396  * from our current block number.
 397  *
 398  * last_blk contains the block number of the first block with a given
 399  * cycle number.
 400  *
 401  * Return: zero if normal, non-zero if error.
 402  */
 403 STATIC int
 404 xlog_find_head(
 405         struct xlog     *log,
 406         xfs_daddr_t     *return_head_blk)
 407 {
 408         xfs_buf_t       *bp;
 409         char            *offset;
 410         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 411         int             num_scan_bblks;
 412         uint            first_half_cycle, last_half_cycle;
 413         uint            stop_on_cycle;
 414         int             error, log_bbnum = log->l_logBBsize;
 415
 416         /* Is the end of the log device zeroed? */
 417         if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
 418                 *return_head_blk = first_blk;
 419
 420                 /* Is the whole lot zeroed? */
 421                 if (!first_blk) {
 422                         /* Linux XFS shouldn't generate totally zeroed logs -
 423                          * mkfs etc write a dummy unmount record to a fresh
 424                          * log so we can store the uuid in there
 425                          */
 426                         xfs_warn(log->l_mp, "totally zeroed log");
 427                 }
 428
 429                 return 0;
 430         } else if (error) {
 431                 xfs_warn(log->l_mp, "empty log check failed");
 432                 return error;
 433         }
 434
 435         first_blk = 0;                  /* get cycle # of 1st block */
 436         bp = xlog_get_bp(log, 1);
 437         if (!bp)
 438                 return ENOMEM;
 439
 440         error = xlog_bread(log, 0, 1, bp, &offset);
 441         if (error)
 442                 goto bp_err;
 443
 444         first_half_cycle = xlog_get_cycle(offset);
 445
 446         last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 447         error = xlog_bread(log, last_blk, 1, bp, &offset);
 448         if (error)
 449                 goto bp_err;
 450
 451         last_half_cycle = xlog_get_cycle(offset);
 452         ASSERT(last_half_cycle != 0);
 453
 454         /*
 455          * If the 1st half cycle number is equal to the last half cycle number,
 456          * then the entire log is stamped with the same cycle number.  In this
 457          * case, head_blk can't be set to zero (which makes sense).  The below
 458          * math doesn't work out properly with head_blk equal to zero.  Instead,
 459          * we set it to log_bbnum which is an invalid block number, but this
 460          * value makes the math correct.  If head_blk doesn't changed through
 461          * all the tests below, *head_blk is set to zero at the very end rather
 462          * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 463          * in a circular file.
 464          */
 465         if (first_half_cycle == last_half_cycle) {
 466                 /*
 467                  * In this case we believe that the entire log should have
 468                  * cycle number last_half_cycle.  We need to scan backwards
 469                  * from the end verifying that there are no holes still
 470                  * containing last_half_cycle - 1.  If we find such a hole,
 471                  * then the start of that hole will be the new head.  The
 472                  * simple case looks like
 473                  *        x | x ... | x - 1 | x
 474                  * Another case that fits this picture would be
 475                  *        x | x + 1 | x ... | x
 476                  * In this case the head really is somewhere at the end of the
 477                  * log, as one of the latest writes at the beginning was
 478                  * incomplete.
 479                  * One more case is
 480                  *        x | x + 1 | x ... | x - 1 | x
 481                  * This is really the combination of the above two cases, and
 482                  * the head has to end up at the start of the x-1 hole at the
 483                  * end of the log.
 484                  *
 485                  * In the 256k log case, we will read from the beginning to the
 486                  * end of the log and search for cycle numbers equal to x-1.
 487                  * We don't worry about the x+1 blocks that we encounter,
 488                  * because we know that they cannot be the head since the log
 489                  * started with x.
 490                  */
 491                 head_blk = log_bbnum;
 492                 stop_on_cycle = last_half_cycle - 1;
 493         } else {
 494                 /*
 495                  * In this case we want to find the first block with cycle
 496                  * number matching last_half_cycle.  We expect the log to be
 497                  * some variation on
 498                  *        x + 1 ... | x ... | x
 499                  * The first block with cycle number x (last_half_cycle) will
 500                  * be where the new head belongs.  First we do a binary search
 501                  * for the first occurrence of last_half_cycle.  The binary
 502                  * search may not be totally accurate, so then we scan back
 503                  * from there looking for occurrences of last_half_cycle before
 504                  * us.  If that backwards scan wraps around the beginning of
 505                  * the log, then we look for occurrences of last_half_cycle - 1
 506                  * at the end of the log.  The cases we're looking for look
 507                  * like
 508                  *                               v binary search stopped here
 509                  *        x + 1 ... | x | x + 1 | x ... | x
 510                  *                   ^ but we want to locate this spot
 511                  * or
 512                  *        <---------> less than scan distance
 513                  *        x + 1 ... | x ... | x - 1 | x
 514                  *                           ^ we want to locate this spot
 515                  */
 516                 stop_on_cycle = last_half_cycle;
 517                 if ((error = xlog_find_cycle_start(log, bp, first_blk,
 518                                                 &head_blk, last_half_cycle)))
 519                         goto bp_err;
 520         }
 521
 522         /*
 523          * Now validate the answer.  Scan back some number of maximum possible
 524          * blocks and make sure each one has the expected cycle number.  The
 525          * maximum is determined by the total possible amount of buffering
 526          * in the in-core log.  The following number can be made tighter if
 527          * we actually look at the block size of the filesystem.
 528          */
 529         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 530         if (head_blk >= num_scan_bblks) {
 531                 /*
 532                  * We are guaranteed that the entire check can be performed
 533                  * in one buffer.
 534                  */
 535                 start_blk = head_blk - num_scan_bblks;
 536                 if ((error = xlog_find_verify_cycle(log,
 537                                                 start_blk, num_scan_bblks,
 538                                                 stop_on_cycle, &new_blk)))
 539                         goto bp_err;
 540                 if (new_blk != -1)
 541                         head_blk = new_blk;
 542         } else {                /* need to read 2 parts of log */
 543                 /*
 544                  * We are going to scan backwards in the log in two parts.
 545                  * First we scan the physical end of the log.  In this part
 546                  * of the log, we are looking for blocks with cycle number
 547                  * last_half_cycle - 1.
 548                  * If we find one, then we know that the log starts there, as
 549                  * we've found a hole that didn't get written in going around
 550                  * the end of the physical log.  The simple case for this is
 551                  *        x + 1 ... | x ... | x - 1 | x
 552                  *        <---------> less than scan distance
 553                  * If all of the blocks at the end of the log have cycle number
 554                  * last_half_cycle, then we check the blocks at the start of
 555                  * the log looking for occurrences of last_half_cycle.  If we
 556                  * find one, then our current estimate for the location of the
 557                  * first occurrence of last_half_cycle is wrong and we move
 558                  * back to the hole we've found.  This case looks like
 559                  *        x + 1 ... | x | x + 1 | x ...
 560                  *                               ^ binary search stopped here
 561                  * Another case we need to handle that only occurs in 256k
 562                  * logs is
 563                  *        x + 1 ... | x ... | x+1 | x ...
 564                  *                   ^ binary search stops here
 565                  * In a 256k log, the scan at the end of the log will see the
 566                  * x + 1 blocks.  We need to skip past those since that is
 567                  * certainly not the head of the log.  By searching for
 568                  * last_half_cycle-1 we accomplish that.
 569                  */
 570                 ASSERT(head_blk <= INT_MAX &&
 571                         (xfs_daddr_t) num_scan_bblks >= head_blk);
 572                 start_blk = log_bbnum - (num_scan_bblks - head_blk);
 573                 if ((error = xlog_find_verify_cycle(log, start_blk,
 574                                         num_scan_bblks - (int)head_blk,
 575                                         (stop_on_cycle - 1), &new_blk)))
 576                         goto bp_err;
 577                 if (new_blk != -1) {
 578                         head_blk = new_blk;
 579                         goto validate_head;
 580                 }
 581
 582                 /*
 583                  * Scan beginning of log now.  The last part of the physical
 584                  * log is good.  This scan needs to verify that it doesn't find
 585                  * the last_half_cycle.
 586                  */
 587                 start_blk = 0;
 588                 ASSERT(head_blk <= INT_MAX);
 589                 if ((error = xlog_find_verify_cycle(log,
 590                                         start_blk, (int)head_blk,
 591                                         stop_on_cycle, &new_blk)))
 592                         goto bp_err;
 593                 if (new_blk != -1)
 594                         head_blk = new_blk;
 595         }
 596
 597 validate_head:
 598         /*
 599          * Now we need to make sure head_blk is not pointing to a block in
 600          * the middle of a log record.
 601          */
 602         num_scan_bblks = XLOG_REC_SHIFT(log);
 603         if (head_blk >= num_scan_bblks) {
 604                 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 605
 606                 /* start ptr at last block ptr before head_blk */
 607                 if ((error = xlog_find_verify_log_record(log, start_blk,
 608                                                         &head_blk, 0)) == -1) {
 609                         error = XFS_ERROR(EIO);
 610                         goto bp_err;
 611                 } else if (error)
 612                         goto bp_err;
 613         } else {
 614                 start_blk = 0;
 615                 ASSERT(head_blk <= INT_MAX);
 616                 if ((error = xlog_find_verify_log_record(log, start_blk,
 617                                                         &head_blk, 0)) == -1) {
 618                         /* We hit the beginning of the log during our search */
 619                         start_blk = log_bbnum - (num_scan_bblks - head_blk);
 620                         new_blk = log_bbnum;
 621                         ASSERT(start_blk <= INT_MAX &&
 622                                 (xfs_daddr_t) log_bbnum-start_blk >= 0);
 623                         ASSERT(head_blk <= INT_MAX);
 624                         if ((error = xlog_find_verify_log_record(log,
 625                                                         start_blk, &new_blk,
 626                                                         (int)head_blk)) == -1) {
 627                                 error = XFS_ERROR(EIO);
 628                                 goto bp_err;
 629                         } else if (error)
 630                                 goto bp_err;
 631                         if (new_blk != log_bbnum)
 632                                 head_blk = new_blk;
 633                 } else if (error)
 634                         goto bp_err;
 635         }
 636
 637         xlog_put_bp(bp);
 638         if (head_blk == log_bbnum)
 639                 *return_head_blk = 0;
 640         else
 641                 *return_head_blk = head_blk;
 642         /*
 643          * When returning here, we have a good block number.  Bad block
 644          * means that during a previous crash, we didn't have a clean break
 645          * from cycle number N to cycle number N-1.  In this case, we need
 646          * to find the first block with cycle number N-1.
 647          */
 648         return 0;
 649
 650  bp_err:
 651         xlog_put_bp(bp);
 652
 653         if (error)
 654                 xfs_warn(log->l_mp, "failed to find log head");
 655         return error;
 656 }
 657
 658 /*
 659  * Find the sync block number or the tail of the log.
 660  *
 661  * This will be the block number of the last record to have its
 662  * associated buffers synced to disk.  Every log record header has
 663  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 664  * to get a sync block number.  The only concern is to figure out which
 665  * log record header to believe.
 666  *
 667  * The following algorithm uses the log record header with the largest
 668  * lsn.  The entire log record does not need to be valid.  We only care
 669  * that the header is valid.
 670  *
 671  * We could speed up search by using current head_blk buffer, but it is not
 672  * available.
 673  */
 674 int
 675 xlog_find_tail(
 676         struct xlog             *log,
 677         xfs_daddr_t             *head_blk,
 678         xfs_daddr_t             *tail_blk)
 679 {
 680         xlog_rec_header_t       *rhead;
 681         xlog_op_header_t        *op_head;
 682         char                    *offset = NULL;
 683         xfs_buf_t               *bp;
 684         int                     error, i, found;
 685         xfs_daddr_t             umount_data_blk;
 686         xfs_daddr_t             after_umount_blk;
 687         xfs_lsn_t               tail_lsn;
 688         int                     hblks;
 689
 690         found = 0;
 691
 692         /*
 693          * Find previous log record
 694          */
 695         if ((error = xlog_find_head(log, head_blk)))
 696                 return error;
 697
 698         bp = xlog_get_bp(log, 1);
 699         if (!bp)
 700                 return ENOMEM;
 701         if (*head_blk == 0) {                           /* special case */
 702                 error = xlog_bread(log, 0, 1, bp, &offset);
 703                 if (error)
 704                         goto done;
 705
 706                 if (xlog_get_cycle(offset) == 0) {
 707                         *tail_blk = 0;
 708                         /* leave all other log inited values alone */
 709                         goto done;
 710                 }
 711         }
 712
 713         /*
 714          * Search backwards looking for log record header block
 715          */
 716         ASSERT(*head_blk < INT_MAX);
 717         for (i = (int)(*head_blk) - 1; i >= 0; i--) {
 718                 error = xlog_bread(log, i, 1, bp, &offset);
 719                 if (error)
 720                         goto done;
 721
 722                 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 723                         found = 1;
 724                         break;
 725                 }
 726         }
 727         /*
 728          * If we haven't found the log record header block, start looking
 729          * again from the end of the physical log.  XXXmiken: There should be
 730          * a check here to make sure we didn't search more than N blocks in
 731          * the previous code.
 732          */
 733         if (!found) {
 734                 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
 735                         error = xlog_bread(log, i, 1, bp, &offset);
 736                         if (error)
 737                                 goto done;
 738
 739                         if (*(__be32 *)offset ==
 740                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 741                                 found = 2;
 742                                 break;
 743                         }
 744                 }
 745         }
 746         if (!found) {
 747                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
 748                 xlog_put_bp(bp);
 749                 ASSERT(0);
 750                 return XFS_ERROR(EIO);
 751         }
 752
 753         /* find blk_no of tail of log */
 754         rhead = (xlog_rec_header_t *)offset;
 755         *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 756
 757         /*
 758          * Reset log values according to the state of the log when we
 759          * crashed.  In the case where head_blk == 0, we bump curr_cycle
 760          * one because the next write starts a new cycle rather than
 761          * continuing the cycle of the last good log record.  At this
 762          * point we have guaranteed that all partial log records have been
 763          * accounted for.  Therefore, we know that the last good log record
 764          * written was complete and ended exactly on the end boundary
 765          * of the physical log.
 766          */
 767         log->l_prev_block = i;
 768         log->l_curr_block = (int)*head_blk;
 769         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 770         if (found == 2)
 771                 log->l_curr_cycle++;
 772         atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
 773         atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
 774         xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
 775                                         BBTOB(log->l_curr_block));
 776         xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
 777                                         BBTOB(log->l_curr_block));
 778
 779         /*
 780          * Look for unmount record.  If we find it, then we know there
 781          * was a clean unmount.  Since 'i' could be the last block in
 782          * the physical log, we convert to a log block before comparing
 783          * to the head_blk.
 784          *
 785          * Save the current tail lsn to use to pass to
 786          * xlog_clear_stale_blocks() below.  We won't want to clear the
 787          * unmount record if there is one, so we pass the lsn of the
 788          * unmount record rather than the block after it.
 789          */
 790         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 791                 int     h_size = be32_to_cpu(rhead->h_size);
 792                 int     h_version = be32_to_cpu(rhead->h_version);
 793
 794                 if ((h_version & XLOG_VERSION_2) &&
 795                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
 796                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
 797                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
 798                                 hblks++;
 799                 } else {
 800                         hblks = 1;
 801                 }
 802         } else {
 803                 hblks = 1;
 804         }
 805         after_umount_blk = (i + hblks + (int)
 806                 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
 807         tail_lsn = atomic64_read(&log->l_tail_lsn);
 808         if (*head_blk == after_umount_blk &&
 809             be32_to_cpu(rhead->h_num_logops) == 1) {
 810                 umount_data_blk = (i + hblks) % log->l_logBBsize;
 811                 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
 812                 if (error)
 813                         goto done;
 814
 815                 op_head = (xlog_op_header_t *)offset;
 816                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
 817                         /*
 818                          * Set tail and last sync so that newly written
 819                          * log records will point recovery to after the
 820                          * current unmount record.
 821                          */
 822                         xlog_assign_atomic_lsn(&log->l_tail_lsn,
 823                                         log->l_curr_cycle, after_umount_blk);
 824                         xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
 825                                         log->l_curr_cycle, after_umount_blk);
 826                         *tail_blk = after_umount_blk;
 827
 828                         /*
 829                          * Note that the unmount was clean. If the unmount
 830                          * was not clean, we need to know this to rebuild the
 831                          * superblock counters from the perag headers if we
 832                          * have a filesystem using non-persistent counters.
 833                          */
 834                         log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
 835                 }
 836         }
 837
 838         /*
 839          * Make sure that there are no blocks in front of the head
 840          * with the same cycle number as the head.  This can happen
 841          * because we allow multiple outstanding log writes concurrently,
 842          * and the later writes might make it out before earlier ones.
 843          *
 844          * We use the lsn from before modifying it so that we'll never
 845          * overwrite the unmount record after a clean unmount.
 846          *
 847          * Do this only if we are going to recover the filesystem
 848          *
 849          * NOTE: This used to say "if (!readonly)"
 850          * However on Linux, we can & do recover a read-only filesystem.
 851          * We only skip recovery if NORECOVERY is specified on mount,
 852          * in which case we would not be here.
 853          *
 854          * But... if the -device- itself is readonly, just skip this.
 855          * We can't recover this device anyway, so it won't matter.
 856          */
 857         if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
 858                 error = xlog_clear_stale_blocks(log, tail_lsn);
 859
 860 done:
 861         xlog_put_bp(bp);
 862
 863         if (error)
 864                 xfs_warn(log->l_mp, "failed to locate log tail");
 865         return error;
 866 }
 867
 868 /*
 869  * Is the log zeroed at all?
 870  *
 871  * The last binary search should be changed to perform an X block read
 872  * once X becomes small enough.  You can then search linearly through
 873  * the X blocks.  This will cut down on the number of reads we need to do.
 874  *
 875  * If the log is partially zeroed, this routine will pass back the blkno
 876  * of the first block with cycle number 0.  It won't have a complete LR
 877  * preceding it.
 878  *
 879  * Return:
 880  *      0  => the log is completely written to
 881  *      -1 => use *blk_no as the first block of the log
 882  *      >0 => error has occurred
 883  */
 884 int
 885 xlog_find_zeroed(
 886         struct xlog     *log,
 887         xfs_daddr_t     *blk_no)
 888 {
 889         xfs_buf_t       *bp;
 890         char            *offset;
 891         uint            first_cycle, last_cycle;
 892         xfs_daddr_t     new_blk, last_blk, start_blk;
 893         xfs_daddr_t     num_scan_bblks;
 894         int             error, log_bbnum = log->l_logBBsize;
 895
 896         *blk_no = 0;
 897
 898         /* check totally zeroed log */
 899         bp = xlog_get_bp(log, 1);
 900         if (!bp)
 901                 return ENOMEM;
 902         error = xlog_bread(log, 0, 1, bp, &offset);
 903         if (error)
 904                 goto bp_err;
 905
 906         first_cycle = xlog_get_cycle(offset);
 907         if (first_cycle == 0) {         /* completely zeroed log */
 908                 *blk_no = 0;
 909                 xlog_put_bp(bp);
 910                 return -1;
 911         }
 912
 913         /* check partially zeroed log */
 914         error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
 915         if (error)
 916                 goto bp_err;
 917
 918         last_cycle = xlog_get_cycle(offset);
 919         if (last_cycle != 0) {          /* log completely written to */
 920                 xlog_put_bp(bp);
 921                 return 0;
 922         } else if (first_cycle != 1) {
 923                 /*
 924                  * If the cycle of the last block is zero, the cycle of
 925                  * the first block must be 1. If it's not, maybe we're
 926                  * not looking at a log... Bail out.
 927                  */
 928                 xfs_warn(log->l_mp,
 929                         "Log inconsistent or not a log (last==0, first!=1)");
 930                 error = XFS_ERROR(EINVAL);
 931                 goto bp_err;
 932         }
 933
 934         /* we have a partially zeroed log */
 935         last_blk = log_bbnum-1;
 936         if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
 937                 goto bp_err;
 938
 939         /*
 940          * Validate the answer.  Because there is no way to guarantee that
 941          * the entire log is made up of log records which are the same size,
 942          * we scan over the defined maximum blocks.  At this point, the maximum
 943          * is not chosen to mean anything special.   XXXmiken
 944          */
 945         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 946         ASSERT(num_scan_bblks <= INT_MAX);
 947
 948         if (last_blk < num_scan_bblks)
 949                 num_scan_bblks = last_blk;
 950         start_blk = last_blk - num_scan_bblks;
 951
 952         /*
 953          * We search for any instances of cycle number 0 that occur before
 954          * our current estimate of the head.  What we're trying to detect is
 955          *        1 ... | 0 | 1 | 0...
 956          *                       ^ binary search ends here
 957          */
 958         if ((error = xlog_find_verify_cycle(log, start_blk,
 959                                          (int)num_scan_bblks, 0, &new_blk)))
 960                 goto bp_err;
 961         if (new_blk != -1)
 962                 last_blk = new_blk;
 963
 964         /*
 965          * Potentially backup over partial log record write.  We don't need
 966          * to search the end of the log because we know it is zero.
 967          */
 968         if ((error = xlog_find_verify_log_record(log, start_blk,
 969                                 &last_blk, 0)) == -1) {
 970             error = XFS_ERROR(EIO);
 971             goto bp_err;
 972         } else if (error)
 973             goto bp_err;
 974
 975         *blk_no = last_blk;
 976 bp_err:
 977         xlog_put_bp(bp);
 978         if (error)
 979                 return error;
 980         return -1;
 981 }
 982
 983 STATIC xlog_recover_t *
 984 xlog_recover_find_tid(
 985         struct hlist_head       *head,
 986         xlog_tid_t              tid)
 987 {
 988         xlog_recover_t          *trans;
 989         struct hlist_node       *n;
 990
 991         hlist_for_each_entry(trans, n, head, r_list) {
 992                 if (trans->r_log_tid == tid)
 993                         return trans;
 994         }
 995         return NULL;
 996 }
 997
 998 STATIC void
 999 xlog_recover_new_tid(
1000         struct hlist_head       *head,
1001         xlog_tid_t              tid,
1002         xfs_lsn_t               lsn)
1003 {
1004         xlog_recover_t          *trans;
1005
1006         trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1007         trans->r_log_tid   = tid;
1008         trans->r_lsn       = lsn;
1009         INIT_LIST_HEAD(&trans->r_itemq);
1010
1011         INIT_HLIST_NODE(&trans->r_list);
1012         hlist_add_head(&trans->r_list, head);
1013 }
1014
1015 STATIC void
1016 xlog_recover_add_item(
1017         struct list_head        *head)
1018 {
1019         xlog_recover_item_t     *item;
1020
1021         item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1022         INIT_LIST_HEAD(&item->ri_list);
1023         list_add_tail(&item->ri_list, head);
1024 }
1025
1026 #define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
1027
1028 STATIC int
1029 xlog_recover_add_to_cont_trans(
1030         struct xlog             *log,
1031         struct xlog_recover     *trans,
1032         char                    *dp,
1033         int                     len)
1034 {
1035         xlog_recover_item_t     *item;
1036         char                    *ptr, *old_ptr;
1037         int                     old_len;
1038
1039         if (list_empty(&trans->r_itemq)) {
1040                 /* finish copying rest of trans header */
1041                 xlog_recover_add_item(&trans->r_itemq);
1042                 ptr = (char *) &trans->r_theader +
1043                                 sizeof(xfs_trans_header_t) - len;
1044                 memcpy(ptr, dp, len); /* d, s, l */
1045                 return 0;
1046         }
1047         /* take the tail entry */
1048         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1049
1050         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1051         old_len = item->ri_buf[item->ri_cnt-1].i_len;
1052
1053         ptr = kmem_realloc(old_ptr, len+old_len, KM_SLEEP);
1054         memcpy(&ptr[old_len], dp, len); /* d, s, l */
1055         item->ri_buf[item->ri_cnt-1].i_len += len;
1056         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1057         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1058         return 0;
1059 }
1060
1061 /*
1062  * The next region to add is the start of a new region.  It could be
1063  * a whole region or it could be the first part of a new region.  Because
1064  * of this, the assumption here is that the type and size fields of all
1065  * format structures fit into the first 32 bits of the structure.
1066  *
1067  * This works because all regions must be 32 bit aligned.  Therefore, we
1068  * either have both fields or we have neither field.  In the case we have
1069  * neither field, the data part of the region is zero length.  We only have
1070  * a log_op_header and can throw away the header since a new one will appear
1071  * later.  If we have at least 4 bytes, then we can determine how many regions
1072  * will appear in the current log item.
1073  */
1074 STATIC int
1075 xlog_recover_add_to_trans(
1076         struct xlog             *log,
1077         struct xlog_recover     *trans,
1078         char                    *dp,
1079         int                     len)
1080 {
1081         struct xfs_inode_log_format     *in_f;                  /* any will do */
1082         xlog_recover_item_t     *item;
1083         char                    *ptr;
1084
1085         if (!len)
1086                 return 0;
1087         if (list_empty(&trans->r_itemq)) {
1088                 /* we need to catch log corruptions here */
1089                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1090                         xfs_warn(log->l_mp, "%s: bad header magic number",
1091                                 __func__);
1092                         ASSERT(0);
1093                         return XFS_ERROR(EIO);
1094                 }
1095                 if (len == sizeof(xfs_trans_header_t))
1096                         xlog_recover_add_item(&trans->r_itemq);
1097                 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1098                 return 0;
1099         }
1100
1101         ptr = kmem_alloc(len, KM_SLEEP);
1102         memcpy(ptr, dp, len);
1103         in_f = (struct xfs_inode_log_format *)ptr;
1104
1105         /* take the tail entry */
1106         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1107         if (item->ri_total != 0 &&
1108              item->ri_total == item->ri_cnt) {
1109                 /* tail item is in use, get a new one */
1110                 xlog_recover_add_item(&trans->r_itemq);
1111                 item = list_entry(trans->r_itemq.prev,
1112                                         xlog_recover_item_t, ri_list);
1113         }
1114
1115         if (item->ri_total == 0) {              /* first region to be added */
1116                 if (in_f->ilf_size == 0 ||
1117                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1118                         xfs_warn(log->l_mp,
1119                 "bad number of regions (%d) in inode log format",
1120                                   in_f->ilf_size);
1121                         ASSERT(0);
1122                         kmem_free(ptr);
1123                         return XFS_ERROR(EIO);
1124                 }
1125
1126                 item->ri_total = in_f->ilf_size;
1127                 item->ri_buf =
1128                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1129                                     KM_SLEEP);
1130         }
1131         ASSERT(item->ri_total > item->ri_cnt);
1132         /* Description region is ri_buf[0] */
1133         item->ri_buf[item->ri_cnt].i_addr = ptr;
1134         item->ri_buf[item->ri_cnt].i_len  = len;
1135         item->ri_cnt++;
1136         trace_xfs_log_recover_item_add(log, trans, item, 0);
1137         return 0;
1138 }
1139
1140 /*
1141  * Free up any resources allocated by the transaction
1142  *
1143  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
1144  */
1145 STATIC void
1146 xlog_recover_free_trans(
1147         struct xlog_recover     *trans)
1148 {
1149         xlog_recover_item_t     *item, *n;
1150         int                     i;
1151
1152         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
1153                 /* Free the regions in the item. */
1154                 list_del(&item->ri_list);
1155                 for (i = 0; i < item->ri_cnt; i++)
1156                         kmem_free(item->ri_buf[i].i_addr);
1157                 /* Free the item itself */
1158                 kmem_free(item->ri_buf);
1159                 kmem_free(item);
1160         }
1161         /* Free the transaction recover structure */
1162         kmem_free(trans);
1163 }
1164
1165 /*
1166  * Perform the transaction.
1167  *
1168  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
1169  * EFIs and EFDs get queued up by adding entries into the AIL for them.
1170  */
1171 STATIC int
1172 xlog_recover_commit_trans(
1173         struct xlog             *log,
1174         struct xlog_recover     *trans,
1175         int                     pass)
1176 {
1177         int                     error = 0;
1178
1179         hlist_del(&trans->r_list);
1180         if ((error = xlog_recover_do_trans(log, trans, pass)))
1181                 return error;
1182
1183         xlog_recover_free_trans(trans);
1184         return 0;
1185 }
1186
1187 STATIC int
1188 xlog_recover_unmount_trans(
1189         xlog_recover_t          *trans)
1190 {
1191         /* Do nothing now */
1192         xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
1193         return 0;
1194 }
1195
1196 /*
1197  * There are two valid states of the r_state field.  0 indicates that the
1198  * transaction structure is in a normal state.  We have either seen the
1199  * start of the transaction or the last operation we added was not a partial
1200  * operation.  If the last operation we added to the transaction was a
1201  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
1202  *
1203  * NOTE: skip LRs with 0 data length.
1204  */
1205 STATIC int
1206 xlog_recover_process_data(
1207         struct xlog             *log,
1208         struct hlist_head       rhash[],
1209         struct xlog_rec_header  *rhead,
1210         char                    *dp,
1211         int                     pass)
1212 {
1213         char                    *lp;
1214         int                     num_logops;
1215         xlog_op_header_t        *ohead;
1216         xlog_recover_t          *trans;
1217         xlog_tid_t              tid;
1218         int                     error;
1219         unsigned long           hash;
1220         uint                    flags;
1221
1222         lp = dp + be32_to_cpu(rhead->h_len);
1223         num_logops = be32_to_cpu(rhead->h_num_logops);
1224
1225         /* check the log format matches our own - else we can't recover */
1226         if (xlog_header_check_recover(log->l_mp, rhead))
1227                 return (XFS_ERROR(EIO));
1228
1229         while ((dp < lp) && num_logops) {
1230                 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
1231                 ohead = (xlog_op_header_t *)dp;
1232                 dp += sizeof(xlog_op_header_t);
1233                 if (ohead->oh_clientid != XFS_TRANSACTION &&
1234                     ohead->oh_clientid != XFS_LOG) {
1235                         xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
1236                                         __func__, ohead->oh_clientid);
1237                         ASSERT(0);
1238                         return (XFS_ERROR(EIO));
1239                 }
1240                 tid = be32_to_cpu(ohead->oh_tid);
1241                 hash = XLOG_RHASH(tid);
1242                 trans = xlog_recover_find_tid(&rhash[hash], tid);
1243                 if (trans == NULL) {               /* not found; add new tid */
1244                         if (ohead->oh_flags & XLOG_START_TRANS)
1245                                 xlog_recover_new_tid(&rhash[hash], tid,
1246                                         be64_to_cpu(rhead->h_lsn));
1247                 } else {
1248                         if (dp + be32_to_cpu(ohead->oh_len) > lp) {
1249                                 xfs_warn(log->l_mp, "%s: bad length 0x%x",
1250                                         __func__, be32_to_cpu(ohead->oh_len));
1251                                 return (XFS_ERROR(EIO));
1252                         }
1253                         flags = ohead->oh_flags & ~XLOG_END_TRANS;
1254                         if (flags & XLOG_WAS_CONT_TRANS)
1255                                 flags &= ~XLOG_CONTINUE_TRANS;
1256                         switch (flags) {
1257                         case XLOG_COMMIT_TRANS:
1258                                 error = xlog_recover_commit_trans(log,
1259                                                                 trans, pass);
1260                                 break;
1261                         case XLOG_UNMOUNT_TRANS:
1262                                 error = xlog_recover_unmount_trans(trans);
1263                                 break;
1264                         case XLOG_WAS_CONT_TRANS:
1265                                 error = xlog_recover_add_to_cont_trans(log,
1266                                                 trans, dp,
1267                                                 be32_to_cpu(ohead->oh_len));
1268                                 break;
1269                         case XLOG_START_TRANS:
1270                                 xfs_warn(log->l_mp, "%s: bad transaction",
1271                                         __func__);
1272                                 ASSERT(0);
1273                                 error = XFS_ERROR(EIO);
1274                                 break;
1275                         case 0:
1276                         case XLOG_CONTINUE_TRANS:
1277                                 error = xlog_recover_add_to_trans(log, trans,
1278                                                 dp, be32_to_cpu(ohead->oh_len));
1279                                 break;
1280                         default:
1281                                 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
1282                                         __func__, flags);
1283                                 ASSERT(0);
1284                                 error = XFS_ERROR(EIO);
1285                                 break;
1286                         }
1287                         if (error)
1288                                 return error;
1289                 }
1290                 dp += be32_to_cpu(ohead->oh_len);
1291                 num_logops--;
1292         }
1293         return 0;
1294 }
1295
1296 /*
1297  * Upack the log buffer data and crc check it. If the check fails, issue a
1298  * warning if and only if the CRC in the header is non-zero. This makes the
1299  * check an advisory warning, and the zero CRC check will prevent failure
1300  * warnings from being emitted when upgrading the kernel from one that does not
1301  * add CRCs by default.
1302  *
1303  * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
1304  * corruption failure
1305  *
1306  * XXX: we do not calculate the CRC here yet. It's not clear what we should do
1307  * with CRC errors here in userspace, so we'll address that problem later on.
1308  */
1309 #define xlog_cksum(l,r,dp,len)  ((r)->h_crc)
1310 STATIC int
1311 xlog_unpack_data_crc(
1312         struct xlog_rec_header  *rhead,
1313         char                    *dp,
1314         struct xlog             *log)
1315 {
1316         __le32                  crc;
1317
1318         crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
1319         if (crc != rhead->h_crc) {
1320                 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
1321                         xfs_alert(log->l_mp,
1322                 "log record CRC mismatch: found 0x%x, expected 0x%x.",
1323                                         le32_to_cpu(rhead->h_crc),
1324                                         le32_to_cpu(crc));
1325                         xfs_hex_dump(dp, 32);
1326                 }
1327
1328                 /*
1329                  * If we've detected a log record corruption, then we can't
1330                  * recover past this point. Abort recovery if we are enforcing
1331                  * CRC protection by punting an error back up the stack.
1332                  */
1333                 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
1334                         return EFSCORRUPTED;
1335         }
1336
1337         return 0;
1338 }
1339
1340 STATIC int
1341 xlog_unpack_data(
1342         struct xlog_rec_header  *rhead,
1343         char                    *dp,
1344         struct xlog             *log)
1345 {
1346         int                     i, j, k;
1347         int                     error;
1348
1349         error = xlog_unpack_data_crc(rhead, dp, log);
1350         if (error)
1351                 return error;
1352
1353         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
1354                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
1355                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
1356                 dp += BBSIZE;
1357         }
1358
1359         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1360                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
1361                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
1362                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1363                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1364                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
1365                         dp += BBSIZE;
1366                 }
1367         }
1368
1369         return 0;
1370 }
1371
1372 STATIC int
1373 xlog_valid_rec_header(
1374         struct xlog             *log,
1375         struct xlog_rec_header  *rhead,
1376         xfs_daddr_t             blkno)
1377 {
1378         int                     hlen;
1379
1380         if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
1381                 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
1382                                 XFS_ERRLEVEL_LOW, log->l_mp);
1383                 return XFS_ERROR(EFSCORRUPTED);
1384         }
1385         if (unlikely(
1386             (!rhead->h_version ||
1387             (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
1388                 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
1389                         __func__, be32_to_cpu(rhead->h_version));
1390                 return XFS_ERROR(EIO);
1391         }
1392
1393         /* LR body must have data or it wouldn't have been written */
1394         hlen = be32_to_cpu(rhead->h_len);
1395         if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
1396                 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
1397                                 XFS_ERRLEVEL_LOW, log->l_mp);
1398                 return XFS_ERROR(EFSCORRUPTED);
1399         }
1400         if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
1401                 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
1402                                 XFS_ERRLEVEL_LOW, log->l_mp);
1403                 return XFS_ERROR(EFSCORRUPTED);
1404         }
1405         return 0;
1406 }
1407
1408 /*
1409  * Read the log from tail to head and process the log records found.
1410  * Handle the two cases where the tail and head are in the same cycle
1411  * and where the active portion of the log wraps around the end of
1412  * the physical log separately.  The pass parameter is passed through
1413  * to the routines called to process the data and is not looked at
1414  * here.
1415  */
1416 int
1417 xlog_do_recovery_pass(
1418         struct xlog             *log,
1419         xfs_daddr_t             head_blk,
1420         xfs_daddr_t             tail_blk,
1421         int                     pass)
1422 {
1423         xlog_rec_header_t       *rhead;
1424         xfs_daddr_t             blk_no;
1425         char                    *offset;
1426         xfs_buf_t               *hbp, *dbp;
1427         int                     error = 0, h_size;
1428         int                     bblks, split_bblks;
1429         int                     hblks, split_hblks, wrapped_hblks;
1430         struct hlist_head       rhash[XLOG_RHASH_SIZE];
1431
1432         ASSERT(head_blk != tail_blk);
1433
1434         /*
1435          * Read the header of the tail block and get the iclog buffer size from
1436          * h_size.  Use this to tell how many sectors make up the log header.
1437          */
1438         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1439                 /*
1440                  * When using variable length iclogs, read first sector of
1441                  * iclog header and extract the header size from it.  Get a
1442                  * new hbp that is the correct size.
1443                  */
1444                 hbp = xlog_get_bp(log, 1);
1445                 if (!hbp)
1446                         return ENOMEM;
1447
1448                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
1449                 if (error)
1450                         goto bread_err1;
1451
1452                 rhead = (xlog_rec_header_t *)offset;
1453                 error = xlog_valid_rec_header(log, rhead, tail_blk);
1454                 if (error)
1455                         goto bread_err1;
1456                 h_size = be32_to_cpu(rhead->h_size);
1457                 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
1458                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1459                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1460                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
1461                                 hblks++;
1462                         xlog_put_bp(hbp);
1463                         hbp = xlog_get_bp(log, hblks);
1464                 } else {
1465                         hblks = 1;
1466                 }
1467         } else {
1468                 ASSERT(log->l_sectBBsize == 1);
1469                 hblks = 1;
1470                 hbp = xlog_get_bp(log, 1);
1471                 h_size = XLOG_BIG_RECORD_BSIZE;
1472         }
1473
1474         if (!hbp)
1475                 return ENOMEM;
1476         dbp = xlog_get_bp(log, BTOBB(h_size));
1477         if (!dbp) {
1478                 xlog_put_bp(hbp);
1479                 return ENOMEM;
1480         }
1481
1482         memset(rhash, 0, sizeof(rhash));
1483         if (tail_blk <= head_blk) {
1484                 for (blk_no = tail_blk; blk_no < head_blk; ) {
1485                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1486                         if (error)
1487                                 goto bread_err2;
1488
1489                         rhead = (xlog_rec_header_t *)offset;
1490                         error = xlog_valid_rec_header(log, rhead, blk_no);
1491                         if (error)
1492                                 goto bread_err2;
1493
1494                         /* blocks in data section */
1495                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1496                         error = xlog_bread(log, blk_no + hblks, bblks, dbp,
1497                                            &offset);
1498                         if (error)
1499                                 goto bread_err2;
1500
1501                         error = xlog_unpack_data(rhead, offset, log);
1502                         if (error)
1503                                 goto bread_err2;
1504
1505                         error = xlog_recover_process_data(log,
1506                                                 rhash, rhead, offset, pass);
1507                         if (error)
1508                                 goto bread_err2;
1509                         blk_no += bblks + hblks;
1510                 }
1511         } else {
1512                 /*
1513                  * Perform recovery around the end of the physical log.
1514                  * When the head is not on the same cycle number as the tail,
1515                  * we can't do a sequential recovery as above.
1516                  */
1517                 blk_no = tail_blk;
1518                 while (blk_no < log->l_logBBsize) {
1519                         /*
1520                          * Check for header wrapping around physical end-of-log
1521                          */
1522                         offset = hbp->b_addr;
1523                         split_hblks = 0;
1524                         wrapped_hblks = 0;
1525                         if (blk_no + hblks <= log->l_logBBsize) {
1526                                 /* Read header in one read */
1527                                 error = xlog_bread(log, blk_no, hblks, hbp,
1528                                                    &offset);
1529                                 if (error)
1530                                         goto bread_err2;
1531                         } else {
1532                                 /* This LR is split across physical log end */
1533                                 if (blk_no != log->l_logBBsize) {
1534                                         /* some data before physical log end */
1535                                         ASSERT(blk_no <= INT_MAX);
1536                                         split_hblks = log->l_logBBsize - (int)blk_no;
1537                                         ASSERT(split_hblks > 0);
1538                                         error = xlog_bread(log, blk_no,
1539                                                            split_hblks, hbp,
1540                                                            &offset);
1541                                         if (error)
1542                                                 goto bread_err2;
1543                                 }
1544
1545                                 /*
1546                                  * Note: this black magic still works with
1547                                  * large sector sizes (non-512) only because:
1548                                  * - we increased the buffer size originally
1549                                  *   by 1 sector giving us enough extra space
1550                                  *   for the second read;
1551                                  * - the log start is guaranteed to be sector
1552                                  *   aligned;
1553                                  * - we read the log end (LR header start)
1554                                  *   _first_, then the log start (LR header end)
1555                                  *   - order is important.
1556                                  */
1557                                 wrapped_hblks = hblks - split_hblks;
1558                                 error = xlog_bread_offset(log, 0,
1559                                                 wrapped_hblks, hbp,
1560                                                 offset + BBTOB(split_hblks));
1561                                 if (error)
1562                                         goto bread_err2;
1563                         }
1564                         rhead = (xlog_rec_header_t *)offset;
1565                         error = xlog_valid_rec_header(log, rhead,
1566                                                 split_hblks ? blk_no : 0);
1567                         if (error)
1568                                 goto bread_err2;
1569
1570                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1571                         blk_no += hblks;
1572
1573                         /* Read in data for log record */
1574                         if (blk_no + bblks <= log->l_logBBsize) {
1575                                 error = xlog_bread(log, blk_no, bblks, dbp,
1576                                                    &offset);
1577                                 if (error)
1578                                         goto bread_err2;
1579                         } else {
1580                                 /* This log record is split across the
1581                                  * physical end of log */
1582                                 offset = dbp->b_addr;
1583                                 split_bblks = 0;
1584                                 if (blk_no != log->l_logBBsize) {
1585                                         /* some data is before the physical
1586                                          * end of log */
1587                                         ASSERT(!wrapped_hblks);
1588                                         ASSERT(blk_no <= INT_MAX);
1589                                         split_bblks =
1590                                                 log->l_logBBsize - (int)blk_no;
1591                                         ASSERT(split_bblks > 0);
1592                                         error = xlog_bread(log, blk_no,
1593                                                         split_bblks, dbp,
1594                                                         &offset);
1595                                         if (error)
1596                                                 goto bread_err2;
1597                                 }
1598
1599                                 /*
1600                                  * Note: this black magic still works with
1601                                  * large sector sizes (non-512) only because:
1602                                  * - we increased the buffer size originally
1603                                  *   by 1 sector giving us enough extra space
1604                                  *   for the second read;
1605                                  * - the log start is guaranteed to be sector
1606                                  *   aligned;
1607                                  * - we read the log end (LR header start)
1608                                  *   _first_, then the log start (LR header end)
1609                                  *   - order is important.
1610                                  */
1611                                 error = xlog_bread_offset(log, 0,
1612                                                 bblks - split_bblks, dbp,
1613                                                 offset + BBTOB(split_bblks));
1614                                 if (error)
1615                                         goto bread_err2;
1616                         }
1617
1618                         error = xlog_unpack_data(rhead, offset, log);
1619                         if (error)
1620                                 goto bread_err2;
1621
1622                         error = xlog_recover_process_data(log, rhash,
1623                                                         rhead, offset, pass);
1624                         if (error)
1625                                 goto bread_err2;
1626                         blk_no += bblks;
1627                 }
1628
1629                 ASSERT(blk_no >= log->l_logBBsize);
1630                 blk_no -= log->l_logBBsize;
1631
1632                 /* read first part of physical log */
1633                 while (blk_no < head_blk) {
1634                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1635                         if (error)
1636                                 goto bread_err2;
1637
1638                         rhead = (xlog_rec_header_t *)offset;
1639                         error = xlog_valid_rec_header(log, rhead, blk_no);
1640                         if (error)
1641                                 goto bread_err2;
1642
1643                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1644                         error = xlog_bread(log, blk_no+hblks, bblks, dbp,
1645                                            &offset);
1646                         if (error)
1647                                 goto bread_err2;
1648
1649                         error = xlog_unpack_data(rhead, offset, log);
1650                         if (error)
1651                                 goto bread_err2;
1652
1653                         error = xlog_recover_process_data(log, rhash,
1654                                                         rhead, offset, pass);
1655                         if (error)
1656                                 goto bread_err2;
1657                         blk_no += bblks + hblks;
1658                 }
1659         }
1660
1661  bread_err2:
1662         xlog_put_bp(dbp);
1663  bread_err1:
1664         xlog_put_bp(hbp);
1665         return error;
1666 }