repair/prefetch.c

   1 #include "libxfs.h"
   2 #include <pthread.h>
   3 #include "avl.h"
   4 #include "btree.h"
   5 #include "globals.h"
   6 #include "agheader.h"
   7 #include "incore.h"
   8 #include "dir2.h"
   9 #include "protos.h"
  10 #include "err_protos.h"
  11 #include "dinode.h"
  12 #include "bmap.h"
  13 #include "versions.h"
  14 #include "threads.h"
  15 #include "prefetch.h"
  16 #include "progress.h"
  17
  18 int do_prefetch = 1;
  19
/*
 * Performs prefetching by priming the libxfs cache, using a dedicated thread
 * that scans inodes and reads blocks in ahead of the time they are required.
 *
 * Any I/O errors can be safely ignored.
 */
  26
  27 static xfs_mount_t      *mp;
  28 static int              mp_fd;
  29 static int              pf_max_bytes;
  30 static int              pf_max_bbs;
  31 static int              pf_max_fsbs;
  32 static int              pf_batch_bytes;
  33 static int              pf_batch_fsbs;
  34
  35 static void             pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
  36
  37 /*
  38  * Buffer priorities for the libxfs cache
  39  *
  40  * Directory metadata is ranked higher than other metadata as it's used
  41  * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
  42  */
  43
  44 /* intermediate directory btree nodes - can't be queued */
  45 #define B_DIR_BMAP      CACHE_PREFETCH_PRIORITY + 7
  46 /* directory metadata in secondary queue */
  47 #define B_DIR_META_2    CACHE_PREFETCH_PRIORITY + 6
  48 /* dir metadata that had to fetched from the primary queue to avoid stalling */
  49 #define B_DIR_META_H    CACHE_PREFETCH_PRIORITY + 5
  50 /* single block of directory metadata (can't batch read) */
  51 #define B_DIR_META_S    CACHE_PREFETCH_PRIORITY + 4
  52 /* dir metadata with more than one block fetched in a single I/O */
  53 #define B_DIR_META      CACHE_PREFETCH_PRIORITY + 3
  54 /* inode clusters with directory inodes */
  55 #define B_DIR_INODE     CACHE_PREFETCH_PRIORITY + 2
  56 /* intermediate extent btree nodes */
  57 #define B_BMAP          CACHE_PREFETCH_PRIORITY + 1
  58 /* inode clusters without any directory entries */
  59 #define B_INODE         CACHE_PREFETCH_PRIORITY
  60
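/*
 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
 * the buffer is for an inode or other metadata.  The two inode priorities
 * (offsets 0 and 2 from CACHE_PREFETCH_PRIORITY) have neither bit set, while
 * every other offset used above (1 and 3-7) sets at least one of them.  This
 * relies on CACHE_PREFETCH_PRIORITY itself having bits 0-2 clear.
 */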
  65 #define B_IS_INODE(f)   (((f) & 5) == 0)
  66
  67 #define DEF_BATCH_BYTES 0x10000
  68
  69 #define MAX_BUFS        128
  70
  71 #define IO_THRESHOLD    (MAX_BUFS * 2)
  72
  73 typedef enum pf_which {
  74         PF_PRIMARY,
  75         PF_SECONDARY,
  76         PF_META_ONLY
  77 } pf_which_t;
  78
  79
  80 static inline void
  81 pf_start_processing(
  82         prefetch_args_t         *args)
  83 {
  84         if (!args->can_start_processing) {
  85                 pftrace("signalling processing for AG %d", args->agno);
  86
  87                 args->can_start_processing = 1;
  88                 pthread_cond_signal(&args->start_processing);
  89         }
  90 }
  91
  92 static inline void
  93 pf_start_io_workers(
  94         prefetch_args_t         *args)
  95 {
  96         if (!args->can_start_reading) {
  97                 pftrace("signalling reading for AG %d", args->agno);
  98
  99                 args->can_start_reading = 1;
 100                 pthread_cond_broadcast(&args->start_reading);
 101         }
 102 }
 103
 104
 105 static void
 106 pf_queue_io(
 107         prefetch_args_t         *args,
 108         struct xfs_buf_map      *map,
 109         int                     nmaps,
 110         int                     flag)
 111 {
 112         struct xfs_buf          *bp;
 113         xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);
 114
 115         /*
 116          * Never block on a buffer lock here, given that the actual repair
 117          * code might lock buffers in a different order from us.  Given that
 118          * the lock holder is either reading it from disk himself or
 119          * completely overwriting it this behaviour is perfectly fine.
 120          */
 121         bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
 122         if (!bp)
 123                 return;
 124
 125         if (bp->b_flags & LIBXFS_B_UPTODATE) {
 126                 if (B_IS_INODE(flag))
 127                         pf_read_inode_dirs(args, bp);
 128                 XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
 129                                                 CACHE_PREFETCH_PRIORITY);
 130                 libxfs_putbuf(bp);
 131                 return;
 132         }
 133         XFS_BUF_SET_PRIORITY(bp, flag);
 134
 135         pthread_mutex_lock(&args->lock);
 136
 137         btree_insert(args->io_queue, fsbno, bp);
 138
 139         if (fsbno > args->last_bno_read) {
 140                 if (B_IS_INODE(flag)) {
 141                         args->inode_bufs_queued++;
 142                         if (args->inode_bufs_queued == IO_THRESHOLD)
 143                                 pf_start_io_workers(args);
 144                 }
 145         } else {
 146                 ASSERT(!B_IS_INODE(flag));
 147                 XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
 148         }
 149
 150         pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
 151                 "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
 152                 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
 153                 args->inode_bufs_queued, args->last_bno_read);
 154
 155         pf_start_processing(args);
 156
 157         pthread_mutex_unlock(&args->lock);
 158 }
 159
 160 static int
 161 pf_read_bmbt_reclist(
 162         prefetch_args_t         *args,
 163         xfs_bmbt_rec_t          *rp,
 164         int                     numrecs)
 165 {
 166         int                     i;
 167         xfs_bmbt_irec_t         irec;
 168         xfs_filblks_t           cp = 0;         /* prev count */
 169         xfs_fileoff_t           op = 0;         /* prev offset */
 170 #define MAP_ARRAY_SZ 4
 171         struct xfs_buf_map      map_array[MAP_ARRAY_SZ];
 172         struct xfs_buf_map      *map = map_array;
 173         int                     max_extents = MAP_ARRAY_SZ;
 174         int                     nmaps = 0;
 175         unsigned int            len = 0;
 176         int                     ret = 0;
 177
 178
 179         for (i = 0; i < numrecs; i++) {
 180                 libxfs_bmbt_disk_get_all(rp + i, &irec);
 181
 182                 if (((i > 0) && (op + cp > irec.br_startoff)) ||
 183                                 (irec.br_blockcount == 0) ||
 184                                 (irec.br_startoff >= fs_max_file_offset))
 185                         goto out_free;
 186
 187                 if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
 188                                 irec.br_startblock + irec.br_blockcount - 1))
 189                         goto out_free;
 190
 191                 if (!args->dirs_only && ((irec.br_startoff +
 192                                 irec.br_blockcount) >= mp->m_dir_geo->freeblk))
 193                         break;  /* only Phase 6 reads the free blocks */
 194
 195                 op = irec.br_startoff;
 196                 cp = irec.br_blockcount;
 197
 198                 while (irec.br_blockcount) {
 199                         unsigned int    bm_len;
 200
 201                         pftrace("queuing dir extent in AG %d", args->agno);
 202
 203                         if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
 204                                 bm_len = mp->m_dir_geo->fsbcount - len;
 205                         else
 206                                 bm_len = irec.br_blockcount;
 207                         len += bm_len;
 208
 209                         map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
 210                                                         irec.br_startblock);
 211                         map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
 212                         nmaps++;
 213
 214                         if (len == mp->m_dir_geo->fsbcount) {
 215                                 pf_queue_io(args, map, nmaps, B_DIR_META);
 216                                 len = 0;
 217                                 nmaps = 0;
 218                         }
 219
 220                         irec.br_blockcount -= bm_len;
 221                         irec.br_startblock += bm_len;
 222
 223                         /*
 224                          * Handle very fragmented dir2 blocks with dynamically
 225                          * allocated buffer maps.
 226                          */
 227                         if (nmaps >= max_extents) {
 228                                 struct xfs_buf_map *old_map = NULL;
 229
 230                                 if (map == map_array) {
 231                                         old_map = map;
 232                                         map = NULL;
 233                                 }
 234                                 max_extents *= 2;
 235                                 map = realloc(map, max_extents * sizeof(*map));
 236                                 if (map == NULL) {
 237                                         do_error(
 238                         _("couldn't malloc dir2 buffer list\n"));
 239                                         exit(1);
 240                                 }
 241                                 if (old_map)
 242                                         memcpy(map, old_map, sizeof(map_array));
 243                         }
 244
 245                 }
 246         }
 247         ret = 1;
 248 out_free:
 249         if (map != map_array)
 250                 free(map);
 251         return ret;
 252 }
 253
 254 /*
 255  * simplified version of the main scan_lbtree. Returns 0 to stop.
 256  */
 257
 258 static int
 259 pf_scan_lbtree(
 260         xfs_fsblock_t           dbno,
 261         int                     level,
 262         int                     isadir,
 263         prefetch_args_t         *args,
 264         int                     (*func)(struct xfs_btree_block  *block,
 265                                         int                     level,
 266                                         int                     isadir,
 267                                         prefetch_args_t         *args))
 268 {
 269         xfs_buf_t               *bp;
 270         int                     rc;
 271
 272         bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
 273                         XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
 274         if (!bp)
 275                 return 0;
 276
 277         XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
 278
 279         /*
 280          * If the verifier flagged a problem with the buffer, we can't trust
 281          * its contents for the purposes of reading ahead.  Stop prefetching
 282          * the tree and mark the buffer unchecked so that the next read of the
 283          * buffer will retain the error status and be acted upon appropriately.
 284          */
 285         if (bp->b_error) {
 286                 bp->b_flags |= LIBXFS_B_UNCHECKED;
 287                 libxfs_putbuf(bp);
 288                 return 0;
 289         }
 290
 291         rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
 292
 293         libxfs_putbuf(bp);
 294
 295         return rc;
 296 }
 297
 298 static int
 299 pf_scanfunc_bmap(
 300         struct xfs_btree_block  *block,
 301         int                     level,
 302         int                     isadir,
 303         prefetch_args_t         *args)
 304 {
 305         xfs_bmbt_ptr_t          *pp;
 306         int                     numrecs;
 307         int                     i;
 308         xfs_fsblock_t           dbno;
 309
 310         /*
 311          * do some validation on the block contents
 312          */
 313         if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
 314              block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
 315                         (be16_to_cpu(block->bb_level) != level))
 316                 return 0;
 317
 318         numrecs = be16_to_cpu(block->bb_numrecs);
 319
 320         if (level == 0) {
 321                 if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
 322                         return 0;
 323                 return pf_read_bmbt_reclist(args,
 324                         XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
 325         }
 326
 327         if (numrecs > mp->m_bmap_dmxr[1])
 328                 return 0;
 329
 330         pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 331
 332         for (i = 0; i < numrecs; i++) {
 333                 dbno = get_unaligned_be64(&pp[i]);
 334                 if (!verify_dfsbno(mp, dbno))
 335                         return 0;
 336                 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
 337                         return 0;
 338         }
 339         return 1;
 340 }
 341
 342
 343 static void
 344 pf_read_btinode(
 345         prefetch_args_t         *args,
 346         xfs_dinode_t            *dino,
 347         int                     isadir)
 348 {
 349         xfs_bmdr_block_t        *dib;
 350         xfs_bmbt_ptr_t          *pp;
 351         int                     i;
 352         int                     level;
 353         int                     numrecs;
 354         int                     dsize;
 355         xfs_fsblock_t           dbno;
 356
 357         dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
 358
 359         level = be16_to_cpu(dib->bb_level);
 360         numrecs = be16_to_cpu(dib->bb_numrecs);
 361
 362         if ((numrecs == 0) || (level == 0) ||
 363                         (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
 364                 return;
 365         /*
 366          * use bmdr/dfork_dsize since the root block is in the data fork
 367          */
 368         if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
 369                 return;
 370
 371         dsize = XFS_DFORK_DSIZE(dino, mp);
 372         pp = XFS_BMDR_PTR_ADDR(dib, 1, xfs_bmdr_maxrecs(dsize, 0));
 373
 374         for (i = 0; i < numrecs; i++) {
 375                 dbno = get_unaligned_be64(&pp[i]);
 376                 if (!verify_dfsbno(mp, dbno))
 377                         break;
 378                 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
 379                         break;
 380         }
 381 }
 382
 383 static void
 384 pf_read_exinode(
 385         prefetch_args_t         *args,
 386         xfs_dinode_t            *dino)
 387 {
 388         pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
 389                         be32_to_cpu(dino->di_nextents));
 390 }
 391
 392 static void
 393 pf_read_inode_dirs(
 394         prefetch_args_t         *args,
 395         xfs_buf_t               *bp)
 396 {
 397         xfs_dinode_t            *dino;
 398         int                     icnt = 0;
 399         int                     hasdir = 0;
 400         int                     isadir;
 401
 402         libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
 403         if (bp->b_error)
 404                 return;
 405
 406         for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
 407                 dino = xfs_make_iptr(mp, bp, icnt);
 408
 409                 /*
 410                  * We are only prefetching directory contents in extents
 411                  * and btree nodes for other inodes
 412                  */
 413                 isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
 414                 hasdir |= isadir;
 415
 416                 if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
 417                         continue;
 418
 419                 if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
 420                                 args->dirs_only))
 421                         continue;
 422
 423                 /*
 424                  * do some checks on the inode to see if we can prefetch
 425                  * its directory data. It's a cut down version of
 426                  * process_dinode_int() in dinode.c.
 427                  */
 428                 if (dino->di_format > XFS_DINODE_FMT_BTREE)
 429                         continue;
 430
 431                 if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
 432                         continue;
 433
 434                 if (!xfs_dinode_good_version(mp, dino->di_version))
 435                         continue;
 436
 437                 if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
 438                         continue;
 439
 440                 if ((dino->di_forkoff != 0) &&
 441                     (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
 442                         continue;
 443
 444                 switch (dino->di_format) {
 445                         case XFS_DINODE_FMT_EXTENTS:
 446                                 pf_read_exinode(args, dino);
 447                                 break;
 448                         case XFS_DINODE_FMT_BTREE:
 449                                 pf_read_btinode(args, dino, isadir);
 450                                 break;
 451                 }
 452         }
 453         if (hasdir)
 454                 XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
 455 }
 456
 457 /*
 458  * pf_batch_read must be called with the lock locked.
 459  */
 460 static void
 461 pf_batch_read(
 462         prefetch_args_t         *args,
 463         pf_which_t              which,
 464         void                    *buf)
 465 {
 466         xfs_buf_t               *bplist[MAX_BUFS];
 467         unsigned int            num;
 468         off64_t                 first_off, last_off, next_off;
 469         int                     len, size;
 470         int                     i;
 471         int                     inode_bufs;
 472         unsigned long           fsbno = 0;
 473         unsigned long           max_fsbno;
 474         char                    *pbuf;
 475
 476         for (;;) {
 477                 num = 0;
 478                 if (which == PF_SECONDARY) {
 479                         bplist[0] = btree_find(args->io_queue, 0, &fsbno);
 480                         max_fsbno = MIN(fsbno + pf_max_fsbs,
 481                                                         args->last_bno_read);
 482                 } else {
 483                         bplist[0] = btree_find(args->io_queue,
 484                                                 args->last_bno_read, &fsbno);
 485                         max_fsbno = fsbno + pf_max_fsbs;
 486                 }
 487                 while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
 488                         /*
 489                          * Discontiguous buffers need special handling, so stop
 490                          * gathering new buffers and process the list and this
 491                          * discontigous buffer immediately. This avoids the
 492                          * complexity of keeping a separate discontigous buffer
 493                          * list and seeking back over ranges we've already done
 494                          * optimised reads for.
 495                          */
 496                         if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
 497                                 num++;
 498                                 break;
 499                         }
 500
 501                         if (which != PF_META_ONLY ||
 502                                    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
 503                                 num++;
 504                         if (num == MAX_BUFS)
 505                                 break;
 506                         bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
 507                 }
 508                 if (!num)
 509                         return;
 510
 511                 /*
 512                  * do a big read if 25% of the potential buffer is useful,
 513                  * otherwise, find as many close together blocks and
 514                  * read them in one read
 515                  */
 516                 first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
 517                 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
 518                         XFS_BUF_SIZE(bplist[num-1]);
 519                 while (num > 1 && last_off - first_off > pf_max_bytes) {
 520                         num--;
 521                         last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
 522                                 XFS_BUF_SIZE(bplist[num-1]);
 523                 }
 524                 if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
 525                         /*
 526                          * not enough blocks for one big read, so determine
 527                          * the number of blocks that are close enough.
 528                          */
 529                         last_off = first_off + XFS_BUF_SIZE(bplist[0]);
 530                         for (i = 1; i < num; i++) {
 531                                 next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
 532                                                 XFS_BUF_SIZE(bplist[i]);
 533                                 if (next_off - last_off > pf_batch_bytes)
 534                                         break;
 535                                 last_off = next_off;
 536                         }
 537                         num = i;
 538                 }
 539
 540                 for (i = 0; i < num; i++) {
 541                         if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
 542                                         XFS_BUF_ADDR(bplist[i]))) == NULL)
 543                                 do_error(_("prefetch corruption\n"));
 544                 }
 545
 546                 if (which == PF_PRIMARY) {
 547                         for (inode_bufs = 0, i = 0; i < num; i++) {
 548                                 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
 549                                         inode_bufs++;
 550                         }
 551                         args->inode_bufs_queued -= inode_bufs;
 552                         if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
 553                                         pf_batch_fsbs)
 554                                 args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
 555                 }
 556 #ifdef XR_PF_TRACE
 557                 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
 558                         (long long)XFS_BUF_ADDR(bplist[0]),
 559                         (long long)XFS_BUF_ADDR(bplist[num-1]), num,
 560                         (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
 561                         args->last_bno_read, args->inode_bufs_queued);
 562 #endif
 563                 pthread_mutex_unlock(&args->lock);
 564
 565                 /*
 566                  * now read the data and put into the xfs_but_t's
 567                  */
 568                 len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);
 569
 570                 /*
 571                  * Check the last buffer on the list to see if we need to
 572                  * process a discontiguous buffer. The gather above loop
 573                  * guarantees that only the last buffer in the list will be a
 574                  * discontiguous buffer.
 575                  */
 576                 if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
 577                         libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
 578                         bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
 579                         libxfs_putbuf(bplist[num - 1]);
 580                         num--;
 581                 }
 582
 583                 if (len > 0) {
 584                         /*
 585                          * go through the xfs_buf_t list copying from the
 586                          * read buffer into the xfs_buf_t's and release them.
 587                          */
 588                         for (i = 0; i < num; i++) {
 589
 590                                 pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
 591                                 size = XFS_BUF_SIZE(bplist[i]);
 592                                 if (len < size)
 593                                         break;
 594                                 memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
 595                                 bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
 596                                                        LIBXFS_B_UNCHECKED);
 597                                 len -= size;
 598                                 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
 599                                         pf_read_inode_dirs(args, bplist[i]);
 600                                 else if (which == PF_META_ONLY)
 601                                         XFS_BUF_SET_PRIORITY(bplist[i],
 602                                                                 B_DIR_META_H);
 603                                 else if (which == PF_PRIMARY && num == 1)
 604                                         XFS_BUF_SET_PRIORITY(bplist[i],
 605                                                                 B_DIR_META_S);
 606                         }
 607                 }
 608                 for (i = 0; i < num; i++) {
 609                         pftrace("putbuf %c %p (%llu) in AG %d",
 610                                 B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
 611                                 bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
 612                                 args->agno);
 613                         libxfs_putbuf(bplist[i]);
 614                 }
 615                 pthread_mutex_lock(&args->lock);
 616                 if (which != PF_SECONDARY) {
 617                         pftrace("inode_bufs_queued for AG %d = %d", args->agno,
 618                                 args->inode_bufs_queued);
 619                         /*
 620                          * if primary inode queue running low, process metadata
 621                          * in boths queues to avoid I/O starvation as the
 622                          * processing thread would be waiting for a metadata
 623                          * buffer
 624                          */
 625                         if (which == PF_PRIMARY && !args->queuing_done &&
 626                                         args->inode_bufs_queued < IO_THRESHOLD) {
 627                                 pftrace("reading metadata bufs from primary queue for AG %d",
 628                                         args->agno);
 629
 630                                 pf_batch_read(args, PF_META_ONLY, buf);
 631
 632                                 pftrace("reading bufs from secondary queue for AG %d",
 633                                         args->agno);
 634
 635                                 pf_batch_read(args, PF_SECONDARY, buf);
 636                         }
 637                 }
 638         }
 639 }
 640
 641 static void *
 642 pf_io_worker(
 643         void                    *param)
 644 {
 645         prefetch_args_t         *args = param;
 646         void                    *buf = memalign(libxfs_device_alignment(),
 647                                                 pf_max_bytes);
 648
 649         if (buf == NULL)
 650                 return NULL;
 651
 652         pthread_mutex_lock(&args->lock);
 653         while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
 654                 pftrace("waiting to start prefetch I/O for AG %d", args->agno);
 655
 656                 while (!args->can_start_reading && !args->queuing_done)
 657                         pthread_cond_wait(&args->start_reading, &args->lock);
 658
 659                 pftrace("starting prefetch I/O for AG %d", args->agno);
 660
 661                 pf_batch_read(args, PF_PRIMARY, buf);
 662                 pf_batch_read(args, PF_SECONDARY, buf);
 663
 664                 pftrace("ran out of bufs to prefetch for AG %d", args->agno);
 665
 666                 if (!args->queuing_done)
 667                         args->can_start_reading = 0;
 668         }
 669         pthread_mutex_unlock(&args->lock);
 670
 671         free(buf);
 672
 673         pftrace("finished prefetch I/O for AG %d", args->agno);
 674
 675         return NULL;
 676 }
 677
 678 static int
 679 pf_create_prefetch_thread(
 680         prefetch_args_t         *args);
 681
 682 static void *
 683 pf_queuing_worker(
 684         void                    *param)
 685 {
 686         prefetch_args_t         *args = param;
 687         int                     num_inos;
 688         ino_tree_node_t         *irec;
 689         ino_tree_node_t         *cur_irec;
 690         int                     blks_per_cluster;
 691         xfs_agblock_t           bno;
 692         int                     i;
 693         int                     err;
 694         uint64_t                sparse;
 695
 696         blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
 697         if (blks_per_cluster == 0)
 698                 blks_per_cluster = 1;
 699
 700         for (i = 0; i < PF_THREAD_COUNT; i++) {
 701                 err = pthread_create(&args->io_threads[i], NULL,
 702                                 pf_io_worker, args);
 703                 if (err != 0) {
 704                         do_warn(_("failed to create prefetch thread: %s\n"),
 705                                 strerror(err));
 706                         if (i == 0) {
 707                                 pf_start_processing(args);
 708                                 return NULL;
 709                         }
 710                         /*
 711                          * since we have at least one I/O thread, use them for
 712                          * prefetch
 713                          */
 714                         break;
 715                 }
 716         }
 717         pftrace("starting prefetch for AG %d", args->agno);
 718
 719         for (irec = findfirst_inode_rec(args->agno); irec != NULL;
 720                         irec = next_ino_rec(irec)) {
 721
 722                 cur_irec = irec;
 723
 724                 num_inos = XFS_INODES_PER_CHUNK;
 725                 while (num_inos < mp->m_ialloc_inos && irec != NULL) {
 726                         irec = next_ino_rec(irec);
 727                         num_inos += XFS_INODES_PER_CHUNK;
 728                 }
 729
 730                 if (args->dirs_only && cur_irec->ino_isa_dir == 0)
 731                         continue;
 732 #ifdef XR_PF_TRACE
 733                 sem_getvalue(&args->ra_count, &i);
 734                 pftrace("queuing irec %p in AG %d, sem count = %d",
 735                         irec, args->agno, i);
 736 #endif
 737                 err = sem_trywait(&args->ra_count);
 738                 if (err < 0 && errno == EAGAIN) {
 739                         /*
 740                          * Kick the queue once we have reached the limit;
 741                          * without this the threads processing the inodes
 742                          * might get stuck on a buffer that has been locked
 743                          * and added to the I/O queue but is waiting for
 744                          * the thread to be woken.
 745                          */
 746                         pf_start_io_workers(args);
 747                         sem_wait(&args->ra_count);
 748                 }
 749
 750                 num_inos = 0;
 751                 bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
 752                 sparse = cur_irec->ir_sparse;
 753
 754                 do {
 755                         struct xfs_buf_map      map;
 756
 757                         map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
 758                         map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
 759
 760                         /*
 761                          * Queue I/O for each non-sparse cluster. We can check
 762                          * sparse state in cluster sized chunks as cluster size
 763                          * is the min. granularity of sparse irec regions.
 764                          */
 765                         if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
 766                                 pf_queue_io(args, &map, 1,
 767                                             (cur_irec->ino_isa_dir != 0) ?
 768                                              B_DIR_INODE : B_INODE);
 769
 770                         bno += blks_per_cluster;
 771                         num_inos += inodes_per_cluster;
 772                         sparse >>= inodes_per_cluster;
 773                 } while (num_inos < mp->m_ialloc_inos);
 774         }
 775
 776         pthread_mutex_lock(&args->lock);
 777
 778         pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
 779                 args->agno, args->inode_bufs_queued);
 780
 781         args->queuing_done = 1;
 782         pf_start_io_workers(args);
 783         pf_start_processing(args);
 784         pthread_mutex_unlock(&args->lock);
 785
 786         /* now wait for the readers to finish */
 787         for (i = 0; i < PF_THREAD_COUNT; i++)
 788                 if (args->io_threads[i])
 789                         pthread_join(args->io_threads[i], NULL);
 790
 791         pftrace("prefetch for AG %d finished", args->agno);
 792
 793         pthread_mutex_lock(&args->lock);
 794
 795         ASSERT(btree_is_empty(args->io_queue));
 796
 797         args->prefetch_done = 1;
 798         if (args->next_args)
 799                 pf_create_prefetch_thread(args->next_args);
 800
 801         pthread_mutex_unlock(&args->lock);
 802
 803         return NULL;
 804 }
 805
 806 static int
 807 pf_create_prefetch_thread(
 808         prefetch_args_t         *args)
 809 {
 810         int                     err;
 811
 812         pftrace("creating queue thread for AG %d", args->agno);
 813
 814         err = pthread_create(&args->queuing_thread, NULL,
 815                         pf_queuing_worker, args);
 816         if (err != 0) {
 817                 do_warn(_("failed to create prefetch thread: %s\n"),
 818                         strerror(err));
 819                 cleanup_inode_prefetch(args);
 820         }
 821
 822         return err == 0;
 823 }
 824
 825 void
 826 init_prefetch(
 827         xfs_mount_t             *pmp)
 828 {
 829         mp = pmp;
 830         mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
 831         pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
 832         pf_max_bbs = pf_max_bytes >> BBSHIFT;
 833         pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
 834         pf_batch_bytes = DEF_BATCH_BYTES;
 835         pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
 836 }
 837
 838 prefetch_args_t *
 839 start_inode_prefetch(
 840         xfs_agnumber_t          agno,
 841         int                     dirs_only,
 842         prefetch_args_t         *prev_args)
 843 {
 844         prefetch_args_t         *args;
 845         long                    max_queue;
 846
 847         if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
 848                 return NULL;
 849
 850         args = calloc(1, sizeof(prefetch_args_t));
 851
 852         btree_init(&args->io_queue);
 853         if (pthread_mutex_init(&args->lock, NULL) != 0)
 854                 do_error(_("failed to initialize prefetch mutex\n"));
 855         if (pthread_cond_init(&args->start_reading, NULL) != 0)
 856                 do_error(_("failed to initialize prefetch cond var\n"));
 857         if (pthread_cond_init(&args->start_processing, NULL) != 0)
 858                 do_error(_("failed to initialize prefetch cond var\n"));
 859         args->agno = agno;
 860         args->dirs_only = dirs_only;
 861
 862         /*
 863          * use only 1/8 of the libxfs cache as we are only counting inodes
 864          * and not any other associated metadata like directories
 865          */
 866
 867         max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
 868         if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
 869                 max_queue = max_queue *
 870                         (mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
 871                         mp->m_ialloc_blks;
 872
 873         sem_init(&args->ra_count, 0, max_queue);
 874
 875         if (!prev_args) {
 876                 if (!pf_create_prefetch_thread(args))
 877                         return NULL;
 878         } else {
 879                 pthread_mutex_lock(&prev_args->lock);
 880                 if (prev_args->prefetch_done) {
 881                         if (!pf_create_prefetch_thread(args))
 882                                 args = NULL;
 883                 } else
 884                         prev_args->next_args = args;
 885                 pthread_mutex_unlock(&prev_args->lock);
 886         }
 887
 888         return args;
 889 }
 890
 891 /*
 892  * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
 893  * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch
 894  * or process @end_ag). The function starts prefetch on the first AG, then loops
 895  * starting prefetch on the next AG and then blocks processing the current AG as
 896  * the prefetch queue brings inodes into the processing queue.
 897  *
 898  * There is only one prefetch taking place at a time, so the prefetch on the
 899  * next AG only starts once the current AG has been completely prefetched. Hence
 900  * the prefetch of the next AG will start some time before the processing of the
 901  * current AG finishes, ensuring that when we iterate an start processing the
 902  * next AG there is already a significant queue of inodes to process.
 903  *
 904  * Prefetch is done this way to prevent it from running too far ahead of the
 905  * processing. Allowing it to do so can cause cache thrashing, where new
 906  * prefetch causes previously prefetched buffers to be reclaimed before the
 907  * processing thread uses them. This results in reading all the inodes and
 908  * metadata twice per phase and it greatly slows down the processing. Hence we
 909  * have to carefully control how far ahead we prefetch...
 910  */
 911 static void
 912 prefetch_ag_range(
 913         struct work_queue       *work,
 914         xfs_agnumber_t          start_ag,
 915         xfs_agnumber_t          end_ag,
 916         bool                    dirs_only,
 917         void                    (*func)(struct work_queue *,
 918                                         xfs_agnumber_t, void *))
 919 {
 920         int                     i;
 921         struct prefetch_args    *pf_args[2];
 922
 923         pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
 924         for (i = start_ag; i < end_ag; i++) {
 925                 /* Don't prefetch end_ag */
 926                 if (i + 1 < end_ag)
 927                         pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
 928                                                 dirs_only, pf_args[i & 1]);
 929                 func(work, i, pf_args[i & 1]);
 930         }
 931 }
 932
 933 struct pf_work_args {
 934         xfs_agnumber_t  start_ag;
 935         xfs_agnumber_t  end_ag;
 936         bool            dirs_only;
 937         void            (*func)(struct work_queue *, xfs_agnumber_t, void *);
 938 };
 939
 940 static void
 941 prefetch_ag_range_work(
 942         struct work_queue       *work,
 943         xfs_agnumber_t          unused,
 944         void                    *args)
 945 {
 946         struct pf_work_args *wargs = args;
 947
 948         prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
 949                           wargs->dirs_only, wargs->func);
 950         free(args);
 951 }
 952
 953 /*
 954  * Do inode prefetch in the most optimal way for the context under which repair
 955  * has been run.
 956  */
 957 void
 958 do_inode_prefetch(
 959         struct xfs_mount        *mp,
 960         int                     stride,
 961         void                    (*func)(struct work_queue *,
 962                                         xfs_agnumber_t, void *),
 963         bool                    check_cache,
 964         bool                    dirs_only)
 965 {
 966         int                     i;
 967         struct work_queue       queue;
 968         struct work_queue       *queues;
 969         int                     queues_started = 0;
 970
 971         /*
 972          * If the previous phases of repair have not overflowed the buffer
 973          * cache, then we don't need to re-read any of the metadata in the
 974          * filesystem - it's all in the cache. In that case, run a thread per
 975          * CPU to maximise parallelism of the queue to be processed.
 976          */
 977         if (check_cache && !libxfs_bcache_overflowed()) {
 978                 queue.mp = mp;
 979                 create_work_queue(&queue, mp, libxfs_nproc());
 980                 for (i = 0; i < mp->m_sb.sb_agcount; i++)
 981                         queue_work(&queue, func, i, NULL);
 982                 destroy_work_queue(&queue);
 983                 return;
 984         }
 985
 986         /*
 987          * single threaded behaviour - single prefetch thread, processed
 988          * directly after each AG is queued.
 989          */
 990         if (!stride) {
 991                 queue.mp = mp;
 992                 prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
 993                                   dirs_only, func);
 994                 return;
 995         }
 996
 997         /*
 998          * create one worker thread for each segment of the volume
 999          */
1000         queues = malloc(thread_count * sizeof(work_queue_t));
1001         for (i = 0; i < thread_count; i++) {
1002                 struct pf_work_args *wargs;
1003
1004                 wargs = malloc(sizeof(struct pf_work_args));
1005                 wargs->start_ag = i * stride;
1006                 wargs->end_ag = min((i + 1) * stride,
1007                                     mp->m_sb.sb_agcount);
1008                 wargs->dirs_only = dirs_only;
1009                 wargs->func = func;
1010
1011                 create_work_queue(&queues[i], mp, 1);
1012                 queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
1013                 queues_started++;
1014
1015                 if (wargs->end_ag >= mp->m_sb.sb_agcount)
1016                         break;
1017         }
1018
1019         /*
1020          * wait for workers to complete
1021          */
1022         for (i = 0; i < queues_started; i++)
1023                 destroy_work_queue(&queues[i]);
1024         free(queues);
1025 }
1026
1027 void
1028 wait_for_inode_prefetch(
1029         prefetch_args_t         *args)
1030 {
1031         if (args == NULL)
1032                 return;
1033
1034         pthread_mutex_lock(&args->lock);
1035
1036         while (!args->can_start_processing) {
1037                 pftrace("waiting to start processing AG %d", args->agno);
1038
1039                 pthread_cond_wait(&args->start_processing, &args->lock);
1040         }
1041         pftrace("can start processing AG %d", args->agno);
1042
1043         pthread_mutex_unlock(&args->lock);
1044 }
1045
1046 void
1047 cleanup_inode_prefetch(
1048         prefetch_args_t         *args)
1049 {
1050         if (args == NULL)
1051                 return;
1052
1053         pftrace("waiting AG %d prefetch to finish", args->agno);
1054
1055         if (args->queuing_thread)
1056                 pthread_join(args->queuing_thread, NULL);
1057
1058         pftrace("AG %d prefetch done", args->agno);
1059
1060         pthread_mutex_destroy(&args->lock);
1061         pthread_cond_destroy(&args->start_reading);
1062         pthread_cond_destroy(&args->start_processing);
1063         sem_destroy(&args->ra_count);
1064         btree_destroy(args->io_queue);
1065
1066         free(args);
1067 }
1068
1069 #ifdef XR_PF_TRACE
1070
/* Trace output stream; NULL whenever tracing is not active. */
static FILE     *pf_trace_file;

/*
 * Open the prefetch trace file and make it line buffered.  If the file cannot
 * be opened the error is reported on stderr and tracing stays disabled.
 *
 * NOTE(review): the path is a fixed name in /tmp opened with "w"; consider
 * whether that is acceptable for a tool that normally runs as root.
 */
void
pftrace_init(void)
{
        pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
        if (!pf_trace_file) {
                perror("/tmp/xfs_repair_prefetch.trace");
                return;
        }
        setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

/*
 * Close the trace file, if one is open.  Safe to call when tracing was never
 * started or has already been shut down.
 */
void
pftrace_done(void)
{
        if (!pf_trace_file)
                return;

        fclose(pf_trace_file);
        pf_trace_file = NULL;
}

/*
 * Append one timestamped record to the trace file.
 *
 * @func: name of the calling function, printed before the message
 * @msg:  printf-style format for the message; the formatted text is
 *        truncated to fit a 200 byte buffer
 *
 * Does nothing when no trace file is open.
 */
void
_pftrace(const char *func, const char *msg, ...)
{
        char            buf[200];
        struct timeval  tv;
        va_list         args;

        if (!pf_trace_file)
                return;

        gettimeofday(&tv, NULL);

        va_start(args, msg);
        vsnprintf(buf, sizeof(buf), msg, args);
        buf[sizeof(buf)-1] = '\0';
        va_end(args);

        /* cast so the arguments match the %lu conversions on every platform */
        fprintf(pf_trace_file, "%lu.%06lu  %s: %s\n",
                (unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec,
                func, buf);
}
1103
1104 #endif