libxlog/xfs_log_recover.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 #include <xfs/libxlog.h>
  20
  21 #define xfs_readonly_buftarg(buftarg)                   (0)
  22
  23 /* avoid set-but-unused var warning. gcc is not very bright. */
  24 #define xlog_clear_stale_blocks(log, taillsn)           ({ \
  25         (taillsn) = (taillsn); \
  26         (0); \
  27 })
  28
  29 #define BLK_AVG(blk1, blk2)    ((blk1+blk2) >> 1)
  30
  31 /*
  32  * Verify the given count of basic blocks is valid number of blocks
  33  * to specify for an operation involving the given XFS log buffer.
  34  * Returns nonzero if the count is valid, 0 otherwise.
  35  */
  36
  37 static inline int
  38 xlog_buf_bbcount_valid(
  39         struct xlog     *log,
  40         int             bbcount)
  41 {
  42         return bbcount > 0 && bbcount <= log->l_logBBsize;
  43 }
  44
  45 /*
  46  * Allocate a buffer to hold log data.  The buffer needs to be able
  47  * to map to a range of nbblks basic blocks at any valid (basic
  48  * block) offset within the log.
  49  */
  50 xfs_buf_t *
  51 xlog_get_bp(
  52         struct xlog     *log,
  53         int             nbblks)
  54 {
  55         if (!xlog_buf_bbcount_valid(log, nbblks)) {
  56                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
  57                         nbblks);
  58                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
  59                 return NULL;
  60         }
  61
  62         /*
  63          * We do log I/O in units of log sectors (a power-of-2
  64          * multiple of the basic block size), so we round up the
  65          * requested size to accommodate the basic blocks required
  66          * for complete log sectors.
  67          *
  68          * In addition, the buffer may be used for a non-sector-
  69          * aligned block offset, in which case an I/O of the
  70          * requested size could extend beyond the end of the
  71          * buffer.  If the requested size is only 1 basic block it
  72          * will never straddle a sector boundary, so this won't be
  73          * an issue.  Nor will this be a problem if the log I/O is
  74          * done in basic blocks (sector size 1).  But otherwise we
  75          * extend the buffer by one extra log sector to ensure
  76          * there's space to accommodate this possibility.
  77          */
  78         if (nbblks > 1 && log->l_sectBBsize > 1)
  79                 nbblks += log->l_sectBBsize;
  80         nbblks = round_up(nbblks, log->l_sectBBsize);
  81
  82         return libxfs_getbufr(log->l_dev, (xfs_daddr_t)-1, nbblks);
  83 }
  84
  85 void
  86 xlog_put_bp(
  87         xfs_buf_t       *bp)
  88 {
  89         libxfs_putbufr(bp);
  90 }
  91
  92 /*
  93  * Return the address of the start of the given block number's data
  94  * in a log buffer.  The buffer covers a log sector-aligned region.
  95  */
  96 STATIC xfs_caddr_t
  97 xlog_align(
  98         struct xlog     *log,
  99         xfs_daddr_t     blk_no,
 100         int             nbblks,
 101         struct xfs_buf  *bp)
 102 {
 103         xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 104
 105         ASSERT(offset + nbblks <= bp->b_length);
 106         return bp->b_addr + BBTOB(offset);
 107 }
 108
 109
 110 /*
 111  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 112  */
 113 int
 114 xlog_bread_noalign(
 115         struct xlog     *log,
 116         xfs_daddr_t     blk_no,
 117         int             nbblks,
 118         struct xfs_buf  *bp)
 119 {
 120         if (!xlog_buf_bbcount_valid(log, nbblks)) {
 121                 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 122                         nbblks);
 123                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 124                 return EFSCORRUPTED;
 125         }
 126
 127         blk_no = round_down(blk_no, log->l_sectBBsize);
 128         nbblks = round_up(nbblks, log->l_sectBBsize);
 129
 130         ASSERT(nbblks > 0);
 131         ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
 132
 133         XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 134         XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
 135         bp->b_error = 0;
 136
 137         return libxfs_readbufr(log->l_dev, XFS_BUF_ADDR(bp), bp, nbblks, 0);
 138 }
 139
 140 int
 141 xlog_bread(
 142         struct xlog     *log,
 143         xfs_daddr_t     blk_no,
 144         int             nbblks,
 145         struct xfs_buf  *bp,
 146         xfs_caddr_t     *offset)
 147 {
 148         int             error;
 149
 150         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 151         if (error)
 152                 return error;
 153
 154         *offset = xlog_align(log, blk_no, nbblks, bp);
 155         return 0;
 156 }
 157
 158 /*
 159  * Read at an offset into the buffer. Returns with the buffer in it's original
 160  * state regardless of the result of the read.
 161  */
 162 STATIC int
 163 xlog_bread_offset(
 164         struct xlog     *log,
 165         xfs_daddr_t     blk_no,         /* block to read from */
 166         int             nbblks,         /* blocks to read */
 167         struct xfs_buf  *bp,
 168         xfs_caddr_t     offset)
 169 {
 170         xfs_caddr_t     orig_offset = bp->b_addr;
 171         int             orig_len = bp->b_bcount;
 172         int             error, error2;
 173
 174         error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks));
 175         if (error)
 176                 return error;
 177
 178         error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 179
 180         /* must reset buffer pointer even on error */
 181         error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len);
 182         if (error)
 183                 return error;
 184         return error2;
 185 }
 186
 187 /*
 188  * This routine finds (to an approximation) the first block in the physical
 189  * log which contains the given cycle.  It uses a binary search algorithm.
 190  * Note that the algorithm can not be perfect because the disk will not
 191  * necessarily be perfect.
 192  */
 193 int
 194 xlog_find_cycle_start(
 195         struct xlog     *log,
 196         struct xfs_buf  *bp,
 197         xfs_daddr_t     first_blk,
 198         xfs_daddr_t     *last_blk,
 199         uint            cycle)
 200 {
 201         xfs_caddr_t     offset;
 202         xfs_daddr_t     mid_blk;
 203         xfs_daddr_t     end_blk;
 204         uint            mid_cycle;
 205         int             error;
 206
 207         end_blk = *last_blk;
 208         mid_blk = BLK_AVG(first_blk, end_blk);
 209         while (mid_blk != first_blk && mid_blk != end_blk) {
 210                 error = xlog_bread(log, mid_blk, 1, bp, &offset);
 211                 if (error)
 212                         return error;
 213                 mid_cycle = xlog_get_cycle(offset);
 214                 if (mid_cycle == cycle)
 215                         end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 216                 else
 217                         first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 218                 mid_blk = BLK_AVG(first_blk, end_blk);
 219         }
 220         ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 221                (mid_blk == end_blk && mid_blk-1 == first_blk));
 222
 223         *last_blk = end_blk;
 224
 225         return 0;
 226 }
 227
 228 /*
 229  * Check that a range of blocks does not contain stop_on_cycle_no.
 230  * Fill in *new_blk with the block offset where such a block is
 231  * found, or with -1 (an invalid block number) if there is no such
 232  * block in the range.  The scan needs to occur from front to back
 233  * and the pointer into the region must be updated since a later
 234  * routine will need to perform another test.
 235  */
 236 STATIC int
 237 xlog_find_verify_cycle(
 238         struct xlog     *log,
 239         xfs_daddr_t     start_blk,
 240         int             nbblks,
 241         uint            stop_on_cycle_no,
 242         xfs_daddr_t     *new_blk)
 243 {
 244         xfs_daddr_t     i, j;
 245         uint            cycle;
 246         xfs_buf_t       *bp;
 247         xfs_daddr_t     bufblks;
 248         xfs_caddr_t     buf = NULL;
 249         int             error = 0;
 250
 251         /*
 252          * Greedily allocate a buffer big enough to handle the full
 253          * range of basic blocks we'll be examining.  If that fails,
 254          * try a smaller size.  We need to be able to read at least
 255          * a log sector, or we're out of luck.
 256          */
 257         bufblks = 1 << ffs(nbblks);
 258         while (bufblks > log->l_logBBsize)
 259                 bufblks >>= 1;
 260         while (!(bp = xlog_get_bp(log, bufblks))) {
 261                 bufblks >>= 1;
 262                 if (bufblks < log->l_sectBBsize)
 263                         return ENOMEM;
 264         }
 265
 266         for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 267                 int     bcount;
 268
 269                 bcount = min(bufblks, (start_blk + nbblks - i));
 270
 271                 error = xlog_bread(log, i, bcount, bp, &buf);
 272                 if (error)
 273                         goto out;
 274
 275                 for (j = 0; j < bcount; j++) {
 276                         cycle = xlog_get_cycle(buf);
 277                         if (cycle == stop_on_cycle_no) {
 278                                 *new_blk = i+j;
 279                                 goto out;
 280                         }
 281
 282                         buf += BBSIZE;
 283                 }
 284         }
 285
 286         *new_blk = -1;
 287
 288 out:
 289         xlog_put_bp(bp);
 290         return error;
 291 }
 292
 293 /*
 294  * Potentially backup over partial log record write.
 295  *
 296  * In the typical case, last_blk is the number of the block directly after
 297  * a good log record.  Therefore, we subtract one to get the block number
 298  * of the last block in the given buffer.  extra_bblks contains the number
 299  * of blocks we would have read on a previous read.  This happens when the
 300  * last log record is split over the end of the physical log.
 301  *
 302  * extra_bblks is the number of blocks potentially verified on a previous
 303  * call to this routine.
 304  */
 305 STATIC int
 306 xlog_find_verify_log_record(
 307         struct xlog             *log,
 308         xfs_daddr_t             start_blk,
 309         xfs_daddr_t             *last_blk,
 310         int                     extra_bblks)
 311 {
 312         xfs_daddr_t             i;
 313         xfs_buf_t               *bp;
 314         xfs_caddr_t             offset = NULL;
 315         xlog_rec_header_t       *head = NULL;
 316         int                     error = 0;
 317         int                     smallmem = 0;
 318         int                     num_blks = *last_blk - start_blk;
 319         int                     xhdrs;
 320
 321         ASSERT(start_blk != 0 || *last_blk != start_blk);
 322
 323         if (!(bp = xlog_get_bp(log, num_blks))) {
 324                 if (!(bp = xlog_get_bp(log, 1)))
 325                         return ENOMEM;
 326                 smallmem = 1;
 327         } else {
 328                 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 329                 if (error)
 330                         goto out;
 331                 offset += ((num_blks - 1) << BBSHIFT);
 332         }
 333
 334         for (i = (*last_blk) - 1; i >= 0; i--) {
 335                 if (i < start_blk) {
 336                         /* valid log record not found */
 337                         xfs_warn(log->l_mp,
 338                 "Log inconsistent (didn't find previous header)");
 339                         ASSERT(0);
 340                         error = XFS_ERROR(EIO);
 341                         goto out;
 342                 }
 343
 344                 if (smallmem) {
 345                         error = xlog_bread(log, i, 1, bp, &offset);
 346                         if (error)
 347                                 goto out;
 348                 }
 349
 350                 head = (xlog_rec_header_t *)offset;
 351
 352                 if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 353                         break;
 354
 355                 if (!smallmem)
 356                         offset -= BBSIZE;
 357         }
 358
 359         /*
 360          * We hit the beginning of the physical log & still no header.  Return
 361          * to caller.  If caller can handle a return of -1, then this routine
 362          * will be called again for the end of the physical log.
 363          */
 364         if (i == -1) {
 365                 error = -1;
 366                 goto out;
 367         }
 368
 369         /*
 370          * We have the final block of the good log (the first block
 371          * of the log record _before_ the head. So we check the uuid.
 372          */
 373         if ((error = xlog_header_check_mount(log->l_mp, head)))
 374                 goto out;
 375
 376         /*
 377          * We may have found a log record header before we expected one.
 378          * last_blk will be the 1st block # with a given cycle #.  We may end
 379          * up reading an entire log record.  In this case, we don't want to
 380          * reset last_blk.  Only when last_blk points in the middle of a log
 381          * record do we update last_blk.
 382          */
 383         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 384                 uint    h_size = be32_to_cpu(head->h_size);
 385
 386                 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 387                 if (h_size % XLOG_HEADER_CYCLE_SIZE)
 388                         xhdrs++;
 389         } else {
 390                 xhdrs = 1;
 391         }
 392
 393         if (*last_blk - i + extra_bblks !=
 394             BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 395                 *last_blk = i;
 396
 397 out:
 398         xlog_put_bp(bp);
 399         return error;
 400 }
 401
 402 /*
 403  * Head is defined to be the point of the log where the next log write
 404  * write could go.  This means that incomplete LR writes at the end are
 405  * eliminated when calculating the head.  We aren't guaranteed that previous
 406  * LR have complete transactions.  We only know that a cycle number of
 407  * current cycle number -1 won't be present in the log if we start writing
 408  * from our current block number.
 409  *
 410  * last_blk contains the block number of the first block with a given
 411  * cycle number.
 412  *
 413  * Return: zero if normal, non-zero if error.
 414  */
 415 STATIC int
 416 xlog_find_head(
 417         struct xlog     *log,
 418         xfs_daddr_t     *return_head_blk)
 419 {
 420         xfs_buf_t       *bp;
 421         xfs_caddr_t     offset;
 422         xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 423         int             num_scan_bblks;
 424         uint            first_half_cycle, last_half_cycle;
 425         uint            stop_on_cycle;
 426         int             error, log_bbnum = log->l_logBBsize;
 427
 428         /* Is the end of the log device zeroed? */
 429         if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
 430                 *return_head_blk = first_blk;
 431
 432                 /* Is the whole lot zeroed? */
 433                 if (!first_blk) {
 434                         /* Linux XFS shouldn't generate totally zeroed logs -
 435                          * mkfs etc write a dummy unmount record to a fresh
 436                          * log so we can store the uuid in there
 437                          */
 438                         xfs_warn(log->l_mp, "totally zeroed log");
 439                 }
 440
 441                 return 0;
 442         } else if (error) {
 443                 xfs_warn(log->l_mp, "empty log check failed");
 444                 return error;
 445         }
 446
 447         first_blk = 0;                  /* get cycle # of 1st block */
 448         bp = xlog_get_bp(log, 1);
 449         if (!bp)
 450                 return ENOMEM;
 451
 452         error = xlog_bread(log, 0, 1, bp, &offset);
 453         if (error)
 454                 goto bp_err;
 455
 456         first_half_cycle = xlog_get_cycle(offset);
 457
 458         last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 459         error = xlog_bread(log, last_blk, 1, bp, &offset);
 460         if (error)
 461                 goto bp_err;
 462
 463         last_half_cycle = xlog_get_cycle(offset);
 464         ASSERT(last_half_cycle != 0);
 465
 466         /*
 467          * If the 1st half cycle number is equal to the last half cycle number,
 468          * then the entire log is stamped with the same cycle number.  In this
 469          * case, head_blk can't be set to zero (which makes sense).  The below
 470          * math doesn't work out properly with head_blk equal to zero.  Instead,
 471          * we set it to log_bbnum which is an invalid block number, but this
 472          * value makes the math correct.  If head_blk doesn't changed through
 473          * all the tests below, *head_blk is set to zero at the very end rather
 474          * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 475          * in a circular file.
 476          */
 477         if (first_half_cycle == last_half_cycle) {
 478                 /*
 479                  * In this case we believe that the entire log should have
 480                  * cycle number last_half_cycle.  We need to scan backwards
 481                  * from the end verifying that there are no holes still
 482                  * containing last_half_cycle - 1.  If we find such a hole,
 483                  * then the start of that hole will be the new head.  The
 484                  * simple case looks like
 485                  *        x | x ... | x - 1 | x
 486                  * Another case that fits this picture would be
 487                  *        x | x + 1 | x ... | x
 488                  * In this case the head really is somewhere at the end of the
 489                  * log, as one of the latest writes at the beginning was
 490                  * incomplete.
 491                  * One more case is
 492                  *        x | x + 1 | x ... | x - 1 | x
 493                  * This is really the combination of the above two cases, and
 494                  * the head has to end up at the start of the x-1 hole at the
 495                  * end of the log.
 496                  *
 497                  * In the 256k log case, we will read from the beginning to the
 498                  * end of the log and search for cycle numbers equal to x-1.
 499                  * We don't worry about the x+1 blocks that we encounter,
 500                  * because we know that they cannot be the head since the log
 501                  * started with x.
 502                  */
 503                 head_blk = log_bbnum;
 504                 stop_on_cycle = last_half_cycle - 1;
 505         } else {
 506                 /*
 507                  * In this case we want to find the first block with cycle
 508                  * number matching last_half_cycle.  We expect the log to be
 509                  * some variation on
 510                  *        x + 1 ... | x ... | x
 511                  * The first block with cycle number x (last_half_cycle) will
 512                  * be where the new head belongs.  First we do a binary search
 513                  * for the first occurrence of last_half_cycle.  The binary
 514                  * search may not be totally accurate, so then we scan back
 515                  * from there looking for occurrences of last_half_cycle before
 516                  * us.  If that backwards scan wraps around the beginning of
 517                  * the log, then we look for occurrences of last_half_cycle - 1
 518                  * at the end of the log.  The cases we're looking for look
 519                  * like
 520                  *                               v binary search stopped here
 521                  *        x + 1 ... | x | x + 1 | x ... | x
 522                  *                   ^ but we want to locate this spot
 523                  * or
 524                  *        <---------> less than scan distance
 525                  *        x + 1 ... | x ... | x - 1 | x
 526                  *                           ^ we want to locate this spot
 527                  */
 528                 stop_on_cycle = last_half_cycle;
 529                 if ((error = xlog_find_cycle_start(log, bp, first_blk,
 530                                                 &head_blk, last_half_cycle)))
 531                         goto bp_err;
 532         }
 533
 534         /*
 535          * Now validate the answer.  Scan back some number of maximum possible
 536          * blocks and make sure each one has the expected cycle number.  The
 537          * maximum is determined by the total possible amount of buffering
 538          * in the in-core log.  The following number can be made tighter if
 539          * we actually look at the block size of the filesystem.
 540          */
 541         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 542         if (head_blk >= num_scan_bblks) {
 543                 /*
 544                  * We are guaranteed that the entire check can be performed
 545                  * in one buffer.
 546                  */
 547                 start_blk = head_blk - num_scan_bblks;
 548                 if ((error = xlog_find_verify_cycle(log,
 549                                                 start_blk, num_scan_bblks,
 550                                                 stop_on_cycle, &new_blk)))
 551                         goto bp_err;
 552                 if (new_blk != -1)
 553                         head_blk = new_blk;
 554         } else {                /* need to read 2 parts of log */
 555                 /*
 556                  * We are going to scan backwards in the log in two parts.
 557                  * First we scan the physical end of the log.  In this part
 558                  * of the log, we are looking for blocks with cycle number
 559                  * last_half_cycle - 1.
 560                  * If we find one, then we know that the log starts there, as
 561                  * we've found a hole that didn't get written in going around
 562                  * the end of the physical log.  The simple case for this is
 563                  *        x + 1 ... | x ... | x - 1 | x
 564                  *        <---------> less than scan distance
 565                  * If all of the blocks at the end of the log have cycle number
 566                  * last_half_cycle, then we check the blocks at the start of
 567                  * the log looking for occurrences of last_half_cycle.  If we
 568                  * find one, then our current estimate for the location of the
 569                  * first occurrence of last_half_cycle is wrong and we move
 570                  * back to the hole we've found.  This case looks like
 571                  *        x + 1 ... | x | x + 1 | x ...
 572                  *                               ^ binary search stopped here
 573                  * Another case we need to handle that only occurs in 256k
 574                  * logs is
 575                  *        x + 1 ... | x ... | x+1 | x ...
 576                  *                   ^ binary search stops here
 577                  * In a 256k log, the scan at the end of the log will see the
 578                  * x + 1 blocks.  We need to skip past those since that is
 579                  * certainly not the head of the log.  By searching for
 580                  * last_half_cycle-1 we accomplish that.
 581                  */
 582                 ASSERT(head_blk <= INT_MAX &&
 583                         (xfs_daddr_t) num_scan_bblks >= head_blk);
 584                 start_blk = log_bbnum - (num_scan_bblks - head_blk);
 585                 if ((error = xlog_find_verify_cycle(log, start_blk,
 586                                         num_scan_bblks - (int)head_blk,
 587                                         (stop_on_cycle - 1), &new_blk)))
 588                         goto bp_err;
 589                 if (new_blk != -1) {
 590                         head_blk = new_blk;
 591                         goto validate_head;
 592                 }
 593
 594                 /*
 595                  * Scan beginning of log now.  The last part of the physical
 596                  * log is good.  This scan needs to verify that it doesn't find
 597                  * the last_half_cycle.
 598                  */
 599                 start_blk = 0;
 600                 ASSERT(head_blk <= INT_MAX);
 601                 if ((error = xlog_find_verify_cycle(log,
 602                                         start_blk, (int)head_blk,
 603                                         stop_on_cycle, &new_blk)))
 604                         goto bp_err;
 605                 if (new_blk != -1)
 606                         head_blk = new_blk;
 607         }
 608
 609 validate_head:
 610         /*
 611          * Now we need to make sure head_blk is not pointing to a block in
 612          * the middle of a log record.
 613          */
 614         num_scan_bblks = XLOG_REC_SHIFT(log);
 615         if (head_blk >= num_scan_bblks) {
 616                 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 617
 618                 /* start ptr at last block ptr before head_blk */
 619                 if ((error = xlog_find_verify_log_record(log, start_blk,
 620                                                         &head_blk, 0)) == -1) {
 621                         error = XFS_ERROR(EIO);
 622                         goto bp_err;
 623                 } else if (error)
 624                         goto bp_err;
 625         } else {
 626                 start_blk = 0;
 627                 ASSERT(head_blk <= INT_MAX);
 628                 if ((error = xlog_find_verify_log_record(log, start_blk,
 629                                                         &head_blk, 0)) == -1) {
 630                         /* We hit the beginning of the log during our search */
 631                         start_blk = log_bbnum - (num_scan_bblks - head_blk);
 632                         new_blk = log_bbnum;
 633                         ASSERT(start_blk <= INT_MAX &&
 634                                 (xfs_daddr_t) log_bbnum-start_blk >= 0);
 635                         ASSERT(head_blk <= INT_MAX);
 636                         if ((error = xlog_find_verify_log_record(log,
 637                                                         start_blk, &new_blk,
 638                                                         (int)head_blk)) == -1) {
 639                                 error = XFS_ERROR(EIO);
 640                                 goto bp_err;
 641                         } else if (error)
 642                                 goto bp_err;
 643                         if (new_blk != log_bbnum)
 644                                 head_blk = new_blk;
 645                 } else if (error)
 646                         goto bp_err;
 647         }
 648
 649         xlog_put_bp(bp);
 650         if (head_blk == log_bbnum)
 651                 *return_head_blk = 0;
 652         else
 653                 *return_head_blk = head_blk;
 654         /*
 655          * When returning here, we have a good block number.  Bad block
 656          * means that during a previous crash, we didn't have a clean break
 657          * from cycle number N to cycle number N-1.  In this case, we need
 658          * to find the first block with cycle number N-1.
 659          */
 660         return 0;
 661
 662  bp_err:
 663         xlog_put_bp(bp);
 664
 665         if (error)
 666                 xfs_warn(log->l_mp, "failed to find log head");
 667         return error;
 668 }
 669
 670 /*
 671  * Find the sync block number or the tail of the log.
 672  *
 673  * This will be the block number of the last record to have its
 674  * associated buffers synced to disk.  Every log record header has
 675  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
 676  * to get a sync block number.  The only concern is to figure out which
 677  * log record header to believe.
 678  *
 679  * The following algorithm uses the log record header with the largest
 680  * lsn.  The entire log record does not need to be valid.  We only care
 681  * that the header is valid.
 682  *
 683  * We could speed up search by using current head_blk buffer, but it is not
 684  * available.
 685  */
 686 int
 687 xlog_find_tail(
 688         struct xlog             *log,
 689         xfs_daddr_t             *head_blk,
 690         xfs_daddr_t             *tail_blk)
 691 {
 692         xlog_rec_header_t       *rhead;
 693         xlog_op_header_t        *op_head;
 694         xfs_caddr_t             offset = NULL;
 695         xfs_buf_t               *bp;
 696         int                     error, i, found;
 697         xfs_daddr_t             umount_data_blk;
 698         xfs_daddr_t             after_umount_blk;
 699         xfs_lsn_t               tail_lsn;
 700         int                     hblks;
 701
 702         found = 0;
 703
 704         /*
 705          * Find previous log record
 706          */
 707         if ((error = xlog_find_head(log, head_blk)))
 708                 return error;
 709
 710         bp = xlog_get_bp(log, 1);
 711         if (!bp)
 712                 return ENOMEM;
 713         if (*head_blk == 0) {                           /* special case */
 714                 error = xlog_bread(log, 0, 1, bp, &offset);
 715                 if (error)
 716                         goto done;
 717
 718                 if (xlog_get_cycle(offset) == 0) {
 719                         *tail_blk = 0;
 720                         /* leave all other log inited values alone */
 721                         goto done;
 722                 }
 723         }
 724
 725         /*
 726          * Search backwards looking for log record header block
 727          */
 728         ASSERT(*head_blk < INT_MAX);
 729         for (i = (int)(*head_blk) - 1; i >= 0; i--) {
 730                 error = xlog_bread(log, i, 1, bp, &offset);
 731                 if (error)
 732                         goto done;
 733
 734                 if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 735                         found = 1;
 736                         break;
 737                 }
 738         }
 739         /*
 740          * If we haven't found the log record header block, start looking
 741          * again from the end of the physical log.  XXXmiken: There should be
 742          * a check here to make sure we didn't search more than N blocks in
 743          * the previous code.
 744          */
 745         if (!found) {
 746                 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
 747                         error = xlog_bread(log, i, 1, bp, &offset);
 748                         if (error)
 749                                 goto done;
 750
 751                         if (*(__be32 *)offset ==
 752                             cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 753                                 found = 2;
 754                                 break;
 755                         }
 756                 }
 757         }
 758         if (!found) {
 759                 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
 760                 xlog_put_bp(bp);
 761                 ASSERT(0);
 762                 return XFS_ERROR(EIO);
 763         }
 764
 765         /* find blk_no of tail of log */
 766         rhead = (xlog_rec_header_t *)offset;
 767         *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
 768
 769         /*
 770          * Reset log values according to the state of the log when we
 771          * crashed.  In the case where head_blk == 0, we bump curr_cycle
 772          * one because the next write starts a new cycle rather than
 773          * continuing the cycle of the last good log record.  At this
 774          * point we have guaranteed that all partial log records have been
 775          * accounted for.  Therefore, we know that the last good log record
 776          * written was complete and ended exactly on the end boundary
 777          * of the physical log.
 778          */
 779         log->l_prev_block = i;
 780         log->l_curr_block = (int)*head_blk;
 781         log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
 782         if (found == 2)
 783                 log->l_curr_cycle++;
 784         atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
 785         atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
 786         xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
 787                                         BBTOB(log->l_curr_block));
 788         xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
 789                                         BBTOB(log->l_curr_block));
 790
 791         /*
 792          * Look for unmount record.  If we find it, then we know there
 793          * was a clean unmount.  Since 'i' could be the last block in
 794          * the physical log, we convert to a log block before comparing
 795          * to the head_blk.
 796          *
 797          * Save the current tail lsn to use to pass to
 798          * xlog_clear_stale_blocks() below.  We won't want to clear the
 799          * unmount record if there is one, so we pass the lsn of the
 800          * unmount record rather than the block after it.
 801          */
 802         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 803                 int     h_size = be32_to_cpu(rhead->h_size);
 804                 int     h_version = be32_to_cpu(rhead->h_version);
 805
 806                 if ((h_version & XLOG_VERSION_2) &&
 807                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
 808                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
 809                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
 810                                 hblks++;
 811                 } else {
 812                         hblks = 1;
 813                 }
 814         } else {
 815                 hblks = 1;
 816         }
 817         after_umount_blk = (i + hblks + (int)
 818                 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
 819         tail_lsn = atomic64_read(&log->l_tail_lsn);
 820         if (*head_blk == after_umount_blk &&
 821             be32_to_cpu(rhead->h_num_logops) == 1) {
 822                 umount_data_blk = (i + hblks) % log->l_logBBsize;
 823                 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
 824                 if (error)
 825                         goto done;
 826
 827                 op_head = (xlog_op_header_t *)offset;
 828                 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
 829                         /*
 830                          * Set tail and last sync so that newly written
 831                          * log records will point recovery to after the
 832                          * current unmount record.
 833                          */
 834                         xlog_assign_atomic_lsn(&log->l_tail_lsn,
 835                                         log->l_curr_cycle, after_umount_blk);
 836                         xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
 837                                         log->l_curr_cycle, after_umount_blk);
 838                         *tail_blk = after_umount_blk;
 839
 840                         /*
 841                          * Note that the unmount was clean. If the unmount
 842                          * was not clean, we need to know this to rebuild the
 843                          * superblock counters from the perag headers if we
 844                          * have a filesystem using non-persistent counters.
 845                          */
 846                         log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
 847                 }
 848         }
 849
 850         /*
 851          * Make sure that there are no blocks in front of the head
 852          * with the same cycle number as the head.  This can happen
 853          * because we allow multiple outstanding log writes concurrently,
 854          * and the later writes might make it out before earlier ones.
 855          *
 856          * We use the lsn from before modifying it so that we'll never
 857          * overwrite the unmount record after a clean unmount.
 858          *
 859          * Do this only if we are going to recover the filesystem
 860          *
 861          * NOTE: This used to say "if (!readonly)"
 862          * However on Linux, we can & do recover a read-only filesystem.
 863          * We only skip recovery if NORECOVERY is specified on mount,
 864          * in which case we would not be here.
 865          *
 866          * But... if the -device- itself is readonly, just skip this.
 867          * We can't recover this device anyway, so it won't matter.
 868          */
 869         if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
 870                 error = xlog_clear_stale_blocks(log, tail_lsn);
 871
 872 done:
 873         xlog_put_bp(bp);
 874
 875         if (error)
 876                 xfs_warn(log->l_mp, "failed to locate log tail");
 877         return error;
 878 }
 879
 880 /*
 881  * Is the log zeroed at all?
 882  *
 883  * The last binary search should be changed to perform an X block read
 884  * once X becomes small enough.  You can then search linearly through
 885  * the X blocks.  This will cut down on the number of reads we need to do.
 886  *
 887  * If the log is partially zeroed, this routine will pass back the blkno
 888  * of the first block with cycle number 0.  It won't have a complete LR
 889  * preceding it.
 890  *
 891  * Return:
 892  *      0  => the log is completely written to
 893  *      -1 => use *blk_no as the first block of the log
 894  *      >0 => error has occurred
 895  */
 896 int
 897 xlog_find_zeroed(
 898         struct xlog     *log,
 899         xfs_daddr_t     *blk_no)
 900 {
 901         xfs_buf_t       *bp;
 902         xfs_caddr_t     offset;
 903         uint            first_cycle, last_cycle;
 904         xfs_daddr_t     new_blk, last_blk, start_blk;
 905         xfs_daddr_t     num_scan_bblks;
 906         int             error, log_bbnum = log->l_logBBsize;
 907
 908         *blk_no = 0;
 909
 910         /* check totally zeroed log */
 911         bp = xlog_get_bp(log, 1);
 912         if (!bp)
 913                 return ENOMEM;
 914         error = xlog_bread(log, 0, 1, bp, &offset);
 915         if (error)
 916                 goto bp_err;
 917
 918         first_cycle = xlog_get_cycle(offset);
 919         if (first_cycle == 0) {         /* completely zeroed log */
 920                 *blk_no = 0;
 921                 xlog_put_bp(bp);
 922                 return -1;
 923         }
 924
 925         /* check partially zeroed log */
 926         error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
 927         if (error)
 928                 goto bp_err;
 929
 930         last_cycle = xlog_get_cycle(offset);
 931         if (last_cycle != 0) {          /* log completely written to */
 932                 xlog_put_bp(bp);
 933                 return 0;
 934         } else if (first_cycle != 1) {
 935                 /*
 936                  * If the cycle of the last block is zero, the cycle of
 937                  * the first block must be 1. If it's not, maybe we're
 938                  * not looking at a log... Bail out.
 939                  */
 940                 xfs_warn(log->l_mp,
 941                         "Log inconsistent or not a log (last==0, first!=1)");
 942                 error = XFS_ERROR(EINVAL);
 943                 goto bp_err;
 944         }
 945
 946         /* we have a partially zeroed log */
 947         last_blk = log_bbnum-1;
 948         if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
 949                 goto bp_err;
 950
 951         /*
 952          * Validate the answer.  Because there is no way to guarantee that
 953          * the entire log is made up of log records which are the same size,
 954          * we scan over the defined maximum blocks.  At this point, the maximum
 955          * is not chosen to mean anything special.   XXXmiken
 956          */
 957         num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 958         ASSERT(num_scan_bblks <= INT_MAX);
 959
 960         if (last_blk < num_scan_bblks)
 961                 num_scan_bblks = last_blk;
 962         start_blk = last_blk - num_scan_bblks;
 963
 964         /*
 965          * We search for any instances of cycle number 0 that occur before
 966          * our current estimate of the head.  What we're trying to detect is
 967          *        1 ... | 0 | 1 | 0...
 968          *                       ^ binary search ends here
 969          */
 970         if ((error = xlog_find_verify_cycle(log, start_blk,
 971                                          (int)num_scan_bblks, 0, &new_blk)))
 972                 goto bp_err;
 973         if (new_blk != -1)
 974                 last_blk = new_blk;
 975
 976         /*
 977          * Potentially backup over partial log record write.  We don't need
 978          * to search the end of the log because we know it is zero.
 979          */
 980         if ((error = xlog_find_verify_log_record(log, start_blk,
 981                                 &last_blk, 0)) == -1) {
 982             error = XFS_ERROR(EIO);
 983             goto bp_err;
 984         } else if (error)
 985             goto bp_err;
 986
 987         *blk_no = last_blk;
 988 bp_err:
 989         xlog_put_bp(bp);
 990         if (error)
 991                 return error;
 992         return -1;
 993 }
 994
 995 STATIC xlog_recover_t *
 996 xlog_recover_find_tid(
 997         struct hlist_head       *head,
 998         xlog_tid_t              tid)
 999 {
1000         xlog_recover_t          *trans;
1001         struct hlist_node       *n;
1002
1003         hlist_for_each_entry(trans, n, head, r_list) {
1004                 if (trans->r_log_tid == tid)
1005                         return trans;
1006         }
1007         return NULL;
1008 }
1009
1010 STATIC void
1011 xlog_recover_new_tid(
1012         struct hlist_head       *head,
1013         xlog_tid_t              tid,
1014         xfs_lsn_t               lsn)
1015 {
1016         xlog_recover_t          *trans;
1017
1018         trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1019         trans->r_log_tid   = tid;
1020         trans->r_lsn       = lsn;
1021         INIT_LIST_HEAD(&trans->r_itemq);
1022
1023         INIT_HLIST_NODE(&trans->r_list);
1024         hlist_add_head(&trans->r_list, head);
1025 }
1026
1027 STATIC void
1028 xlog_recover_add_item(
1029         struct list_head        *head)
1030 {
1031         xlog_recover_item_t     *item;
1032
1033         item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1034         INIT_LIST_HEAD(&item->ri_list);
1035         list_add_tail(&item->ri_list, head);
1036 }
1037
/*
 * Sum of two block numbers shifted right by one bit (their midpoint).  The
 * arguments are expanded without parentheses, so pass only simple operands.
 * This repeats the definition near the top of the file; the replacement
 * text has to stay token-identical to it or the compiler will diagnose a
 * macro redefinition.
 */
#define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
1039
1040 STATIC int
1041 xlog_recover_add_to_cont_trans(
1042         struct xlog             *log,
1043         struct xlog_recover     *trans,
1044         xfs_caddr_t             dp,
1045         int                     len)
1046 {
1047         xlog_recover_item_t     *item;
1048         xfs_caddr_t             ptr, old_ptr;
1049         int                     old_len;
1050
1051         if (list_empty(&trans->r_itemq)) {
1052                 /* finish copying rest of trans header */
1053                 xlog_recover_add_item(&trans->r_itemq);
1054                 ptr = (xfs_caddr_t) &trans->r_theader +
1055                                 sizeof(xfs_trans_header_t) - len;
1056                 memcpy(ptr, dp, len); /* d, s, l */
1057                 return 0;
1058         }
1059         /* take the tail entry */
1060         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1061
1062         old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1063         old_len = item->ri_buf[item->ri_cnt-1].i_len;
1064
1065         ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1066         memcpy(&ptr[old_len], dp, len); /* d, s, l */
1067         item->ri_buf[item->ri_cnt-1].i_len += len;
1068         item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1069         trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1070         return 0;
1071 }
1072
1073 /*
1074  * The next region to add is the start of a new region.  It could be
1075  * a whole region or it could be the first part of a new region.  Because
1076  * of this, the assumption here is that the type and size fields of all
1077  * format structures fit into the first 32 bits of the structure.
1078  *
1079  * This works because all regions must be 32 bit aligned.  Therefore, we
1080  * either have both fields or we have neither field.  In the case we have
1081  * neither field, the data part of the region is zero length.  We only have
1082  * a log_op_header and can throw away the header since a new one will appear
1083  * later.  If we have at least 4 bytes, then we can determine how many regions
1084  * will appear in the current log item.
1085  */
1086 STATIC int
1087 xlog_recover_add_to_trans(
1088         struct xlog             *log,
1089         struct xlog_recover     *trans,
1090         xfs_caddr_t             dp,
1091         int                     len)
1092 {
1093         xfs_inode_log_format_t  *in_f;                  /* any will do */
1094         xlog_recover_item_t     *item;
1095         xfs_caddr_t             ptr;
1096
1097         if (!len)
1098                 return 0;
1099         if (list_empty(&trans->r_itemq)) {
1100                 /* we need to catch log corruptions here */
1101                 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1102                         xfs_warn(log->l_mp, "%s: bad header magic number",
1103                                 __func__);
1104                         ASSERT(0);
1105                         return XFS_ERROR(EIO);
1106                 }
1107                 if (len == sizeof(xfs_trans_header_t))
1108                         xlog_recover_add_item(&trans->r_itemq);
1109                 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1110                 return 0;
1111         }
1112
1113         ptr = kmem_alloc(len, KM_SLEEP);
1114         memcpy(ptr, dp, len);
1115         in_f = (xfs_inode_log_format_t *)ptr;
1116
1117         /* take the tail entry */
1118         item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1119         if (item->ri_total != 0 &&
1120              item->ri_total == item->ri_cnt) {
1121                 /* tail item is in use, get a new one */
1122                 xlog_recover_add_item(&trans->r_itemq);
1123                 item = list_entry(trans->r_itemq.prev,
1124                                         xlog_recover_item_t, ri_list);
1125         }
1126
1127         if (item->ri_total == 0) {              /* first region to be added */
1128                 if (in_f->ilf_size == 0 ||
1129                     in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1130                         xfs_warn(log->l_mp,
1131                 "bad number of regions (%d) in inode log format",
1132                                   in_f->ilf_size);
1133                         ASSERT(0);
1134                         return XFS_ERROR(EIO);
1135                 }
1136
1137                 item->ri_total = in_f->ilf_size;
1138                 item->ri_buf =
1139                         kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1140                                     KM_SLEEP);
1141         }
1142         ASSERT(item->ri_total > item->ri_cnt);
1143         /* Description region is ri_buf[0] */
1144         item->ri_buf[item->ri_cnt].i_addr = ptr;
1145         item->ri_buf[item->ri_cnt].i_len  = len;
1146         item->ri_cnt++;
1147         trace_xfs_log_recover_item_add(log, trans, item, 0);
1148         return 0;
1149 }
1150
1151 /*
1152  * Free up any resources allocated by the transaction
1153  *
1154  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
1155  */
1156 STATIC void
1157 xlog_recover_free_trans(
1158         struct xlog_recover     *trans)
1159 {
1160         xlog_recover_item_t     *item, *n;
1161         int                     i;
1162
1163         list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
1164                 /* Free the regions in the item. */
1165                 list_del(&item->ri_list);
1166                 for (i = 0; i < item->ri_cnt; i++)
1167                         kmem_free(item->ri_buf[i].i_addr);
1168                 /* Free the item itself */
1169                 kmem_free(item->ri_buf);
1170                 kmem_free(item);
1171         }
1172         /* Free the transaction recover structure */
1173         kmem_free(trans);
1174 }
1175
1176 /*
1177  * Perform the transaction.
1178  *
1179  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
1180  * EFIs and EFDs get queued up by adding entries into the AIL for them.
1181  */
1182 STATIC int
1183 xlog_recover_commit_trans(
1184         struct xlog             *log,
1185         struct xlog_recover     *trans,
1186         int                     pass)
1187 {
1188         int                     error = 0;
1189
1190         hlist_del(&trans->r_list);
1191         if ((error = xlog_recover_do_trans(log, trans, pass)))
1192                 return error;
1193
1194         xlog_recover_free_trans(trans);
1195         return 0;
1196 }
1197
1198 STATIC int
1199 xlog_recover_unmount_trans(
1200         xlog_recover_t          *trans)
1201 {
1202         /* Do nothing now */
1203         xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
1204         return 0;
1205 }
1206
1207 /*
1208  * There are two valid states of the r_state field.  0 indicates that the
1209  * transaction structure is in a normal state.  We have either seen the
1210  * start of the transaction or the last operation we added was not a partial
1211  * operation.  If the last operation we added to the transaction was a
1212  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
1213  *
1214  * NOTE: skip LRs with 0 data length.
1215  */
1216 STATIC int
1217 xlog_recover_process_data(
1218         struct xlog             *log,
1219         struct hlist_head       rhash[],
1220         struct xlog_rec_header  *rhead,
1221         xfs_caddr_t             dp,
1222         int                     pass)
1223 {
1224         xfs_caddr_t             lp;
1225         int                     num_logops;
1226         xlog_op_header_t        *ohead;
1227         xlog_recover_t          *trans;
1228         xlog_tid_t              tid;
1229         int                     error;
1230         unsigned long           hash;
1231         uint                    flags;
1232
1233         lp = dp + be32_to_cpu(rhead->h_len);
1234         num_logops = be32_to_cpu(rhead->h_num_logops);
1235
1236         /* check the log format matches our own - else we can't recover */
1237         if (xlog_header_check_recover(log->l_mp, rhead))
1238                 return (XFS_ERROR(EIO));
1239
1240         while ((dp < lp) && num_logops) {
1241                 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
1242                 ohead = (xlog_op_header_t *)dp;
1243                 dp += sizeof(xlog_op_header_t);
1244                 if (ohead->oh_clientid != XFS_TRANSACTION &&
1245                     ohead->oh_clientid != XFS_LOG) {
1246                         xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
1247                                         __func__, ohead->oh_clientid);
1248                         ASSERT(0);
1249                         return (XFS_ERROR(EIO));
1250                 }
1251                 tid = be32_to_cpu(ohead->oh_tid);
1252                 hash = XLOG_RHASH(tid);
1253                 trans = xlog_recover_find_tid(&rhash[hash], tid);
1254                 if (trans == NULL) {               /* not found; add new tid */
1255                         if (ohead->oh_flags & XLOG_START_TRANS)
1256                                 xlog_recover_new_tid(&rhash[hash], tid,
1257                                         be64_to_cpu(rhead->h_lsn));
1258                 } else {
1259                         if (dp + be32_to_cpu(ohead->oh_len) > lp) {
1260                                 xfs_warn(log->l_mp, "%s: bad length 0x%x",
1261                                         __func__, be32_to_cpu(ohead->oh_len));
1262                                 return (XFS_ERROR(EIO));
1263                         }
1264                         flags = ohead->oh_flags & ~XLOG_END_TRANS;
1265                         if (flags & XLOG_WAS_CONT_TRANS)
1266                                 flags &= ~XLOG_CONTINUE_TRANS;
1267                         switch (flags) {
1268                         case XLOG_COMMIT_TRANS:
1269                                 error = xlog_recover_commit_trans(log,
1270                                                                 trans, pass);
1271                                 break;
1272                         case XLOG_UNMOUNT_TRANS:
1273                                 error = xlog_recover_unmount_trans(trans);
1274                                 break;
1275                         case XLOG_WAS_CONT_TRANS:
1276                                 error = xlog_recover_add_to_cont_trans(log,
1277                                                 trans, dp,
1278                                                 be32_to_cpu(ohead->oh_len));
1279                                 break;
1280                         case XLOG_START_TRANS:
1281                                 xfs_warn(log->l_mp, "%s: bad transaction",
1282                                         __func__);
1283                                 ASSERT(0);
1284                                 error = XFS_ERROR(EIO);
1285                                 break;
1286                         case 0:
1287                         case XLOG_CONTINUE_TRANS:
1288                                 error = xlog_recover_add_to_trans(log, trans,
1289                                                 dp, be32_to_cpu(ohead->oh_len));
1290                                 break;
1291                         default:
1292                                 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
1293                                         __func__, flags);
1294                                 ASSERT(0);
1295                                 error = XFS_ERROR(EIO);
1296                                 break;
1297                         }
1298                         if (error)
1299                                 return error;
1300                 }
1301                 dp += be32_to_cpu(ohead->oh_len);
1302                 num_logops--;
1303         }
1304         return 0;
1305 }
1306
1307 /*
1308  * Upack the log buffer data and crc check it. If the check fails, issue a
1309  * warning if and only if the CRC in the header is non-zero. This makes the
1310  * check an advisory warning, and the zero CRC check will prevent failure
1311  * warnings from being emitted when upgrading the kernel from one that does not
1312  * add CRCs by default.
1313  *
1314  * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
1315  * corruption failure
1316  *
1317  * XXX: we do not calculate the CRC here yet. It's not clear what we should do
1318  * with CRC errors here in userspace, so we'll address that problem later on.
1319  */
1320 #define xlog_cksum(l,r,dp,len)  ((r)->h_crc)
1321 STATIC int
1322 xlog_unpack_data_crc(
1323         struct xlog_rec_header  *rhead,
1324         xfs_caddr_t             dp,
1325         struct xlog             *log)
1326 {
1327         __le32                  crc;
1328
1329         crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
1330         if (crc != rhead->h_crc) {
1331                 if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
1332                         xfs_alert(log->l_mp,
1333                 "log record CRC mismatch: found 0x%x, expected 0x%x.",
1334                                         le32_to_cpu(rhead->h_crc),
1335                                         le32_to_cpu(crc));
1336                         xfs_hex_dump(dp, 32);
1337                 }
1338
1339                 /*
1340                  * If we've detected a log record corruption, then we can't
1341                  * recover past this point. Abort recovery if we are enforcing
1342                  * CRC protection by punting an error back up the stack.
1343                  */
1344                 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
1345                         return EFSCORRUPTED;
1346         }
1347
1348         return 0;
1349 }
1350
1351 STATIC int
1352 xlog_unpack_data(
1353         struct xlog_rec_header  *rhead,
1354         xfs_caddr_t             dp,
1355         struct xlog             *log)
1356 {
1357         int                     i, j, k;
1358         int                     error;
1359
1360         error = xlog_unpack_data_crc(rhead, dp, log);
1361         if (error)
1362                 return error;
1363
1364         for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
1365                   i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
1366                 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
1367                 dp += BBSIZE;
1368         }
1369
1370         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1371                 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
1372                 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
1373                         j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1374                         k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
1375                         *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
1376                         dp += BBSIZE;
1377                 }
1378         }
1379
1380         return 0;
1381 }
1382
1383 STATIC int
1384 xlog_valid_rec_header(
1385         struct xlog             *log,
1386         struct xlog_rec_header  *rhead,
1387         xfs_daddr_t             blkno)
1388 {
1389         int                     hlen;
1390
1391         if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
1392                 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
1393                                 XFS_ERRLEVEL_LOW, log->l_mp);
1394                 return XFS_ERROR(EFSCORRUPTED);
1395         }
1396         if (unlikely(
1397             (!rhead->h_version ||
1398             (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
1399                 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
1400                         __func__, be32_to_cpu(rhead->h_version));
1401                 return XFS_ERROR(EIO);
1402         }
1403
1404         /* LR body must have data or it wouldn't have been written */
1405         hlen = be32_to_cpu(rhead->h_len);
1406         if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
1407                 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
1408                                 XFS_ERRLEVEL_LOW, log->l_mp);
1409                 return XFS_ERROR(EFSCORRUPTED);
1410         }
1411         if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
1412                 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
1413                                 XFS_ERRLEVEL_LOW, log->l_mp);
1414                 return XFS_ERROR(EFSCORRUPTED);
1415         }
1416         return 0;
1417 }
1418
1419 /*
1420  * Read the log from tail to head and process the log records found.
1421  * Handle the two cases where the tail and head are in the same cycle
1422  * and where the active portion of the log wraps around the end of
1423  * the physical log separately.  The pass parameter is passed through
1424  * to the routines called to process the data and is not looked at
1425  * here.
1426  */
1427 int
1428 xlog_do_recovery_pass(
1429         struct xlog             *log,
1430         xfs_daddr_t             head_blk,
1431         xfs_daddr_t             tail_blk,
1432         int                     pass)
1433 {
1434         xlog_rec_header_t       *rhead;
1435         xfs_daddr_t             blk_no;
1436         xfs_caddr_t             offset;
1437         xfs_buf_t               *hbp, *dbp;
1438         int                     error = 0, h_size;
1439         int                     bblks, split_bblks;
1440         int                     hblks, split_hblks, wrapped_hblks;
1441         struct hlist_head       rhash[XLOG_RHASH_SIZE];
1442
1443         ASSERT(head_blk != tail_blk);
1444
1445         /*
1446          * Read the header of the tail block and get the iclog buffer size from
1447          * h_size.  Use this to tell how many sectors make up the log header.
1448          */
1449         if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1450                 /*
1451                  * When using variable length iclogs, read first sector of
1452                  * iclog header and extract the header size from it.  Get a
1453                  * new hbp that is the correct size.
1454                  */
1455                 hbp = xlog_get_bp(log, 1);
1456                 if (!hbp)
1457                         return ENOMEM;
1458
1459                 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
1460                 if (error)
1461                         goto bread_err1;
1462
1463                 rhead = (xlog_rec_header_t *)offset;
1464                 error = xlog_valid_rec_header(log, rhead, tail_blk);
1465                 if (error)
1466                         goto bread_err1;
1467                 h_size = be32_to_cpu(rhead->h_size);
1468                 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
1469                     (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1470                         hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1471                         if (h_size % XLOG_HEADER_CYCLE_SIZE)
1472                                 hblks++;
1473                         xlog_put_bp(hbp);
1474                         hbp = xlog_get_bp(log, hblks);
1475                 } else {
1476                         hblks = 1;
1477                 }
1478         } else {
1479                 ASSERT(log->l_sectBBsize == 1);
1480                 hblks = 1;
1481                 hbp = xlog_get_bp(log, 1);
1482                 h_size = XLOG_BIG_RECORD_BSIZE;
1483         }
1484
1485         if (!hbp)
1486                 return ENOMEM;
1487         dbp = xlog_get_bp(log, BTOBB(h_size));
1488         if (!dbp) {
1489                 xlog_put_bp(hbp);
1490                 return ENOMEM;
1491         }
1492
1493         memset(rhash, 0, sizeof(rhash));
1494         if (tail_blk <= head_blk) {
1495                 for (blk_no = tail_blk; blk_no < head_blk; ) {
1496                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1497                         if (error)
1498                                 goto bread_err2;
1499
1500                         rhead = (xlog_rec_header_t *)offset;
1501                         error = xlog_valid_rec_header(log, rhead, blk_no);
1502                         if (error)
1503                                 goto bread_err2;
1504
1505                         /* blocks in data section */
1506                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1507                         error = xlog_bread(log, blk_no + hblks, bblks, dbp,
1508                                            &offset);
1509                         if (error)
1510                                 goto bread_err2;
1511
1512                         error = xlog_unpack_data(rhead, offset, log);
1513                         if (error)
1514                                 goto bread_err2;
1515
1516                         error = xlog_recover_process_data(log,
1517                                                 rhash, rhead, offset, pass);
1518                         if (error)
1519                                 goto bread_err2;
1520                         blk_no += bblks + hblks;
1521                 }
1522         } else {
1523                 /*
1524                  * Perform recovery around the end of the physical log.
1525                  * When the head is not on the same cycle number as the tail,
1526                  * we can't do a sequential recovery as above.
1527                  */
1528                 blk_no = tail_blk;
1529                 while (blk_no < log->l_logBBsize) {
1530                         /*
1531                          * Check for header wrapping around physical end-of-log
1532                          */
1533                         offset = hbp->b_addr;
1534                         split_hblks = 0;
1535                         wrapped_hblks = 0;
1536                         if (blk_no + hblks <= log->l_logBBsize) {
1537                                 /* Read header in one read */
1538                                 error = xlog_bread(log, blk_no, hblks, hbp,
1539                                                    &offset);
1540                                 if (error)
1541                                         goto bread_err2;
1542                         } else {
1543                                 /* This LR is split across physical log end */
1544                                 if (blk_no != log->l_logBBsize) {
1545                                         /* some data before physical log end */
1546                                         ASSERT(blk_no <= INT_MAX);
1547                                         split_hblks = log->l_logBBsize - (int)blk_no;
1548                                         ASSERT(split_hblks > 0);
1549                                         error = xlog_bread(log, blk_no,
1550                                                            split_hblks, hbp,
1551                                                            &offset);
1552                                         if (error)
1553                                                 goto bread_err2;
1554                                 }
1555
1556                                 /*
1557                                  * Note: this black magic still works with
1558                                  * large sector sizes (non-512) only because:
1559                                  * - we increased the buffer size originally
1560                                  *   by 1 sector giving us enough extra space
1561                                  *   for the second read;
1562                                  * - the log start is guaranteed to be sector
1563                                  *   aligned;
1564                                  * - we read the log end (LR header start)
1565                                  *   _first_, then the log start (LR header end)
1566                                  *   - order is important.
1567                                  */
1568                                 wrapped_hblks = hblks - split_hblks;
1569                                 error = xlog_bread_offset(log, 0,
1570                                                 wrapped_hblks, hbp,
1571                                                 offset + BBTOB(split_hblks));
1572                                 if (error)
1573                                         goto bread_err2;
1574                         }
1575                         rhead = (xlog_rec_header_t *)offset;
1576                         error = xlog_valid_rec_header(log, rhead,
1577                                                 split_hblks ? blk_no : 0);
1578                         if (error)
1579                                 goto bread_err2;
1580
1581                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1582                         blk_no += hblks;
1583
1584                         /* Read in data for log record */
1585                         if (blk_no + bblks <= log->l_logBBsize) {
1586                                 error = xlog_bread(log, blk_no, bblks, dbp,
1587                                                    &offset);
1588                                 if (error)
1589                                         goto bread_err2;
1590                         } else {
1591                                 /* This log record is split across the
1592                                  * physical end of log */
1593                                 offset = dbp->b_addr;
1594                                 split_bblks = 0;
1595                                 if (blk_no != log->l_logBBsize) {
1596                                         /* some data is before the physical
1597                                          * end of log */
1598                                         ASSERT(!wrapped_hblks);
1599                                         ASSERT(blk_no <= INT_MAX);
1600                                         split_bblks =
1601                                                 log->l_logBBsize - (int)blk_no;
1602                                         ASSERT(split_bblks > 0);
1603                                         error = xlog_bread(log, blk_no,
1604                                                         split_bblks, dbp,
1605                                                         &offset);
1606                                         if (error)
1607                                                 goto bread_err2;
1608                                 }
1609
1610                                 /*
1611                                  * Note: this black magic still works with
1612                                  * large sector sizes (non-512) only because:
1613                                  * - we increased the buffer size originally
1614                                  *   by 1 sector giving us enough extra space
1615                                  *   for the second read;
1616                                  * - the log start is guaranteed to be sector
1617                                  *   aligned;
1618                                  * - we read the log end (LR header start)
1619                                  *   _first_, then the log start (LR header end)
1620                                  *   - order is important.
1621                                  */
1622                                 error = xlog_bread_offset(log, 0,
1623                                                 bblks - split_bblks, dbp,
1624                                                 offset + BBTOB(split_bblks));
1625                                 if (error)
1626                                         goto bread_err2;
1627                         }
1628
1629                         error = xlog_unpack_data(rhead, offset, log);
1630                         if (error)
1631                                 goto bread_err2;
1632
1633                         error = xlog_recover_process_data(log, rhash,
1634                                                         rhead, offset, pass);
1635                         if (error)
1636                                 goto bread_err2;
1637                         blk_no += bblks;
1638                 }
1639
1640                 ASSERT(blk_no >= log->l_logBBsize);
1641                 blk_no -= log->l_logBBsize;
1642
1643                 /* read first part of physical log */
1644                 while (blk_no < head_blk) {
1645                         error = xlog_bread(log, blk_no, hblks, hbp, &offset);
1646                         if (error)
1647                                 goto bread_err2;
1648
1649                         rhead = (xlog_rec_header_t *)offset;
1650                         error = xlog_valid_rec_header(log, rhead, blk_no);
1651                         if (error)
1652                                 goto bread_err2;
1653
1654                         bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
1655                         error = xlog_bread(log, blk_no+hblks, bblks, dbp,
1656                                            &offset);
1657                         if (error)
1658                                 goto bread_err2;
1659
1660                         error = xlog_unpack_data(rhead, offset, log);
1661                         if (error)
1662                                 goto bread_err2;
1663
1664                         error = xlog_recover_process_data(log, rhash,
1665                                                         rhead, offset, pass);
1666                         if (error)
1667                                 goto bread_err2;
1668                         blk_no += bblks + hblks;
1669                 }
1670         }
1671
1672  bread_err2:
1673         xlog_put_bp(dbp);
1674  bread_err1:
1675         xlog_put_bp(hbp);
1676         return error;
1677 }