repair/prefetch.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include "libxfs.h"
   4 #include <pthread.h>
   5 #include "avl.h"
   6 #include "btree.h"
   7 #include "globals.h"
   8 #include "agheader.h"
   9 #include "incore.h"
  10 #include "dir2.h"
  11 #include "protos.h"
  12 #include "err_protos.h"
  13 #include "dinode.h"
  14 #include "bmap.h"
  15 #include "versions.h"
  16 #include "threads.h"
  17 #include "prefetch.h"
  18 #include "progress.h"
  19
  20 int do_prefetch = 1;    /* external linkage, initially on; NOTE(review): presumably the global prefetch enable switch - no reader is visible in this part of the file, confirm */
  21
  22 /*
  23  * Performs prefetching by priming the libxfs cache, using a dedicated thread
  24  * that scans inodes and reads blocks in ahead of the time they are required.
  25  *
  26  * Any I/O errors can be safely ignored.
  27  */
  28
  29 static xfs_mount_t      *mp;            /* mount whose geometry and device the helpers below read */
  30 static int              mp_fd;          /* descriptor pf_batch_read() hands to pread() */
  31 static int              pf_max_bytes;   /* size of each I/O worker's scratch buffer; cap on the span of one batched read */
  32 static int              pf_max_bbs;     /* NOTE(review): not referenced in this part of the file - confirm use */
  33 static int              pf_max_fsbs;    /* width, in fs blocks, of the window pf_batch_read() gathers buffers from */
  34 static int              pf_batch_bytes; /* largest distance between the ends of neighbouring buffers merged into one small read */
  35 static int              pf_batch_fsbs;  /* block number a primary read must start above before last_bno_read is advanced */
  36
  37 static void             pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
  38
  39 /*
  40  * Buffer priorities for the libxfs cache
  41  *
  42  * Directory metadata is ranked higher than other metadata as it's used
  43  * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
  44  */
  45
  46 /* intermediate directory btree nodes - can't be queued */
  47 #define B_DIR_BMAP      CACHE_PREFETCH_PRIORITY + 7
  48 /* directory metadata in secondary queue */
  49 #define B_DIR_META_2    CACHE_PREFETCH_PRIORITY + 6
  50 /* dir metadata that had to fetched from the primary queue to avoid stalling */
  51 #define B_DIR_META_H    CACHE_PREFETCH_PRIORITY + 5
  52 /* single block of directory metadata (can't batch read) */
  53 #define B_DIR_META_S    CACHE_PREFETCH_PRIORITY + 4
  54 /* dir metadata with more than one block fetched in a single I/O */
  55 #define B_DIR_META      CACHE_PREFETCH_PRIORITY + 3
  56 /* inode clusters with directory inodes */
  57 #define B_DIR_INODE     CACHE_PREFETCH_PRIORITY + 2
  58 /* intermediate extent btree nodes */
  59 #define B_BMAP          CACHE_PREFETCH_PRIORITY + 1
  60 /* inode clusters without any directory entries */
  61 #define B_INODE         CACHE_PREFETCH_PRIORITY
  62
  63 /*
  64  * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
  65  * the buffer is for an inode or other metadata.
  66  */
  67 #define B_IS_INODE(f)   (((f) & 5) == 0)
  68
  69 #define DEF_BATCH_BYTES 0x10000
  70
  71 #define MAX_BUFS        128
  72
  73 #define IO_THRESHOLD    (MAX_BUFS * 2)
  74
  75 typedef enum pf_which {
  76         PF_PRIMARY,     /* pf_batch_read() gathers starting at args->last_bno_read */
  77         PF_SECONDARY,   /* gathers from the start of the queue, no further than last_bno_read */
  78         PF_META_ONLY    /* starts where PF_PRIMARY does but leaves inode buffers on the queue */
  79 } pf_which_t;
  80
  81
  82 static inline void
  83 pf_start_processing(
  84         prefetch_args_t         *args)
  85 {
  86         if (!args->can_start_processing) {
  87                 pftrace("signalling processing for AG %d", args->agno);
  88
  89                 args->can_start_processing = 1;
  90                 pthread_cond_signal(&args->start_processing);
  91         }
  92 }
  93
  94 static inline void
  95 pf_start_io_workers(
  96         prefetch_args_t         *args)
  97 {
  98         if (!args->can_start_reading) {
  99                 pftrace("signalling reading for AG %d", args->agno);
 100
 101                 args->can_start_reading = 1;
 102                 pthread_cond_broadcast(&args->start_reading);
 103         }
 104 }
 105
 106
 107 static void
 108 pf_queue_io(
 109         prefetch_args_t         *args,
 110         struct xfs_buf_map      *map,
 111         int                     nmaps,
 112         int                     flag)
 113 {
 114         struct xfs_buf          *bp;
 115         xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);
 116
 117         /*
 118          * Never block on a buffer lock here, given that the actual repair
 119          * code might lock buffers in a different order from us.  Given that
 120          * the lock holder is either reading it from disk himself or
 121          * completely overwriting it this behaviour is perfectly fine.
 122          */
 123         bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
 124         if (!bp)
 125                 return;
 126
 127         if (bp->b_flags & LIBXFS_B_UPTODATE) {
 128                 if (B_IS_INODE(flag))
 129                         pf_read_inode_dirs(args, bp);
 130                 XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
 131                                                 CACHE_PREFETCH_PRIORITY);
 132                 libxfs_putbuf(bp);
 133                 return;
 134         }
 135         XFS_BUF_SET_PRIORITY(bp, flag);
 136
 137         pthread_mutex_lock(&args->lock);
 138
 139         btree_insert(args->io_queue, fsbno, bp);
 140
 141         if (fsbno > args->last_bno_read) {
 142                 if (B_IS_INODE(flag)) {
 143                         args->inode_bufs_queued++;
 144                         if (args->inode_bufs_queued == IO_THRESHOLD)
 145                                 pf_start_io_workers(args);
 146                 }
 147         } else {
 148                 ASSERT(!B_IS_INODE(flag));
 149                 XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
 150         }
 151
 152         pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
 153                 "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
 154                 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
 155                 args->inode_bufs_queued, args->last_bno_read);
 156
 157         pf_start_processing(args);
 158
 159         pthread_mutex_unlock(&args->lock);
 160 }
 161
 162 static int
 163 pf_read_bmbt_reclist(
 164         prefetch_args_t         *args,
 165         xfs_bmbt_rec_t          *rp,
 166         int                     numrecs)
 167 {
 168         int                     i;
 169         xfs_bmbt_irec_t         irec;
 170         xfs_filblks_t           cp = 0;         /* prev count */
 171         xfs_fileoff_t           op = 0;         /* prev offset */
 172 #define MAP_ARRAY_SZ 4
 173         struct xfs_buf_map      map_array[MAP_ARRAY_SZ];
 174         struct xfs_buf_map      *map = map_array;
 175         int                     max_extents = MAP_ARRAY_SZ;
 176         int                     nmaps = 0;
 177         unsigned int            len = 0;
 178         int                     ret = 0;
 179
 180
 181         for (i = 0; i < numrecs; i++) {
 182                 libxfs_bmbt_disk_get_all(rp + i, &irec);
 183
 184                 if (((i > 0) && (op + cp > irec.br_startoff)) ||
 185                                 (irec.br_blockcount == 0) ||
 186                                 (irec.br_startoff >= fs_max_file_offset))
 187                         goto out_free;
 188
 189                 if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
 190                                 irec.br_startblock + irec.br_blockcount - 1))
 191                         goto out_free;
 192
 193                 if (!args->dirs_only && ((irec.br_startoff +
 194                                 irec.br_blockcount) >= mp->m_dir_geo->freeblk))
 195                         break;  /* only Phase 6 reads the free blocks */
 196
 197                 op = irec.br_startoff;
 198                 cp = irec.br_blockcount;
 199
 200                 while (irec.br_blockcount) {
 201                         unsigned int    bm_len;
 202
 203                         pftrace("queuing dir extent in AG %d", args->agno);
 204
 205                         if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
 206                                 bm_len = mp->m_dir_geo->fsbcount - len;
 207                         else
 208                                 bm_len = irec.br_blockcount;
 209                         len += bm_len;
 210
 211                         map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
 212                                                         irec.br_startblock);
 213                         map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
 214                         nmaps++;
 215
 216                         if (len == mp->m_dir_geo->fsbcount) {
 217                                 pf_queue_io(args, map, nmaps, B_DIR_META);
 218                                 len = 0;
 219                                 nmaps = 0;
 220                         }
 221
 222                         irec.br_blockcount -= bm_len;
 223                         irec.br_startblock += bm_len;
 224
 225                         /*
 226                          * Handle very fragmented dir2 blocks with dynamically
 227                          * allocated buffer maps.
 228                          */
 229                         if (nmaps >= max_extents) {
 230                                 struct xfs_buf_map *old_map = NULL;
 231
 232                                 if (map == map_array) {
 233                                         old_map = map;
 234                                         map = NULL;
 235                                 }
 236                                 max_extents *= 2;
 237                                 map = realloc(map, max_extents * sizeof(*map));
 238                                 if (map == NULL) {
 239                                         do_error(
 240                         _("couldn't malloc dir2 buffer list\n"));
 241                                         exit(1);
 242                                 }
 243                                 if (old_map)
 244                                         memcpy(map, old_map, sizeof(map_array));
 245                         }
 246
 247                 }
 248         }
 249         ret = 1;
 250 out_free:
 251         if (map != map_array)
 252                 free(map);
 253         return ret;
 254 }
 255
 256 /*
 257  * simplified version of the main scan_lbtree. Returns 0 to stop.
 258  */
 259
 260 static int
 261 pf_scan_lbtree(
 262         xfs_fsblock_t           dbno,
 263         int                     level,
 264         int                     isadir,
 265         prefetch_args_t         *args,
 266         int                     (*func)(struct xfs_btree_block  *block,
 267                                         int                     level,
 268                                         int                     isadir,
 269                                         prefetch_args_t         *args))
 270 {
 271         xfs_buf_t               *bp;
 272         int                     rc;
 273
 274         bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
 275                         XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
 276         if (!bp)
 277                 return 0;
 278
 279         XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
 280
 281         /*
 282          * If the verifier flagged a problem with the buffer, we can't trust
 283          * its contents for the purposes of reading ahead.  Stop prefetching
 284          * the tree and mark the buffer unchecked so that the next read of the
 285          * buffer will retain the error status and be acted upon appropriately.
 286          */
 287         if (bp->b_error) {
 288                 bp->b_flags |= LIBXFS_B_UNCHECKED;
 289                 libxfs_putbuf(bp);
 290                 return 0;
 291         }
 292
 293         rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
 294
 295         libxfs_putbuf(bp);
 296
 297         return rc;
 298 }
 299
 300 static int
 301 pf_scanfunc_bmap(
 302         struct xfs_btree_block  *block,
 303         int                     level,
 304         int                     isadir,
 305         prefetch_args_t         *args)
 306 {
 307         xfs_bmbt_ptr_t          *pp;
 308         int                     numrecs;
 309         int                     i;
 310         xfs_fsblock_t           dbno;
 311
 312         /*
 313          * do some validation on the block contents
 314          */
 315         if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
 316              block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
 317                         (be16_to_cpu(block->bb_level) != level))
 318                 return 0;
 319
 320         numrecs = be16_to_cpu(block->bb_numrecs);
 321
 322         if (level == 0) {
 323                 if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
 324                         return 0;
 325                 return pf_read_bmbt_reclist(args,
 326                         XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
 327         }
 328
 329         if (numrecs > mp->m_bmap_dmxr[1])
 330                 return 0;
 331
 332         pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 333
 334         for (i = 0; i < numrecs; i++) {
 335                 dbno = get_unaligned_be64(&pp[i]);
 336                 if (!verify_dfsbno(mp, dbno))
 337                         return 0;
 338                 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
 339                         return 0;
 340         }
 341         return 1;
 342 }
 343
 344
 345 static void
 346 pf_read_btinode(
 347         prefetch_args_t         *args,
 348         xfs_dinode_t            *dino,
 349         int                     isadir)
 350 {
 351         xfs_bmdr_block_t        *dib;
 352         xfs_bmbt_ptr_t          *pp;
 353         int                     i;
 354         int                     level;
 355         int                     numrecs;
 356         int                     dsize;
 357         xfs_fsblock_t           dbno;
 358
 359         dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
 360
 361         level = be16_to_cpu(dib->bb_level);
 362         numrecs = be16_to_cpu(dib->bb_numrecs);
 363
 364         if ((numrecs == 0) || (level == 0) ||
 365                         (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
 366                 return;
 367         /*
 368          * use bmdr/dfork_dsize since the root block is in the data fork
 369          */
 370         if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
 371                 return;
 372
 373         dsize = XFS_DFORK_DSIZE(dino, mp);
 374         pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));
 375
 376         for (i = 0; i < numrecs; i++) {
 377                 dbno = get_unaligned_be64(&pp[i]);
 378                 if (!verify_dfsbno(mp, dbno))
 379                         break;
 380                 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
 381                         break;
 382         }
 383 }
 384
 385 static void
 386 pf_read_exinode(
 387         prefetch_args_t         *args,
 388         xfs_dinode_t            *dino)
 389 {
 390         pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
 391                         be32_to_cpu(dino->di_nextents));
 392 }
 393
 394 static void
 395 pf_read_inode_dirs(
 396         prefetch_args_t         *args,
 397         xfs_buf_t               *bp)
 398 {
 399         xfs_dinode_t            *dino;
 400         int                     icnt = 0;
 401         int                     hasdir = 0;
 402         int                     isadir;
 403
 404         libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
 405         if (bp->b_error)
 406                 return;
 407
 408         for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) {
 409                 dino = xfs_make_iptr(mp, bp, icnt);
 410
 411                 /*
 412                  * We are only prefetching directory contents in extents
 413                  * and btree nodes for other inodes
 414                  */
 415                 isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
 416                 hasdir |= isadir;
 417
 418                 if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
 419                         continue;
 420
 421                 if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
 422                                 args->dirs_only))
 423                         continue;
 424
 425                 /*
 426                  * do some checks on the inode to see if we can prefetch
 427                  * its directory data. It's a cut down version of
 428                  * process_dinode_int() in dinode.c.
 429                  */
 430                 if (dino->di_format > XFS_DINODE_FMT_BTREE)
 431                         continue;
 432
 433                 if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
 434                         continue;
 435
 436                 if (!libxfs_dinode_good_version(mp, dino->di_version))
 437                         continue;
 438
 439                 if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
 440                         continue;
 441
 442                 if ((dino->di_forkoff != 0) &&
 443                     (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
 444                         continue;
 445
 446                 switch (dino->di_format) {
 447                         case XFS_DINODE_FMT_EXTENTS:
 448                                 pf_read_exinode(args, dino);
 449                                 break;
 450                         case XFS_DINODE_FMT_BTREE:
 451                                 pf_read_btinode(args, dino, isadir);
 452                                 break;
 453                 }
 454         }
 455         if (hasdir)
 456                 XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
 457 }
 458
 459 /*
 460  * pf_batch_read must be called with the lock locked.
 461  */
 462 static void
 463 pf_batch_read(
 464         prefetch_args_t         *args,
 465         pf_which_t              which,
 466         void                    *buf)
 467 {
 468         xfs_buf_t               *bplist[MAX_BUFS];
 469         unsigned int            num;
 470         off64_t                 first_off, last_off, next_off;
 471         int                     len, size;
 472         int                     i;
 473         int                     inode_bufs;
 474         unsigned long           fsbno = 0;
 475         unsigned long           max_fsbno;
 476         char                    *pbuf;
 477
 478         for (;;) {
 479                 num = 0;
 480                 if (which == PF_SECONDARY) {
 481                         bplist[0] = btree_find(args->io_queue, 0, &fsbno);
 482                         max_fsbno = min(fsbno + pf_max_fsbs,
 483                                                         args->last_bno_read);
 484                 } else {
 485                         bplist[0] = btree_find(args->io_queue,
 486                                                 args->last_bno_read, &fsbno);
 487                         max_fsbno = fsbno + pf_max_fsbs;
 488                 }
 489                 while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
 490                         /*
 491                          * Discontiguous buffers need special handling, so stop
 492                          * gathering new buffers and process the list and this
 493                          * discontigous buffer immediately. This avoids the
 494                          * complexity of keeping a separate discontigous buffer
 495                          * list and seeking back over ranges we've already done
 496                          * optimised reads for.
 497                          */
 498                         if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
 499                                 num++;
 500                                 break;
 501                         }
 502
 503                         if (which != PF_META_ONLY ||
 504                                    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
 505                                 num++;
 506                         if (num == MAX_BUFS)
 507                                 break;
 508                         bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
 509                 }
 510                 if (!num)
 511                         return;
 512
 513                 /*
 514                  * do a big read if 25% of the potential buffer is useful,
 515                  * otherwise, find as many close together blocks and
 516                  * read them in one read
 517                  */
 518                 first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
 519                 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
 520                         XFS_BUF_SIZE(bplist[num-1]);
 521                 while (num > 1 && last_off - first_off > pf_max_bytes) {
 522                         num--;
 523                         last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
 524                                 XFS_BUF_SIZE(bplist[num-1]);
 525                 }
 526                 if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
 527                         /*
 528                          * not enough blocks for one big read, so determine
 529                          * the number of blocks that are close enough.
 530                          */
 531                         last_off = first_off + XFS_BUF_SIZE(bplist[0]);
 532                         for (i = 1; i < num; i++) {
 533                                 next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
 534                                                 XFS_BUF_SIZE(bplist[i]);
 535                                 if (next_off - last_off > pf_batch_bytes)
 536                                         break;
 537                                 last_off = next_off;
 538                         }
 539                         num = i;
 540                 }
 541
 542                 for (i = 0; i < num; i++) {
 543                         if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
 544                                         XFS_BUF_ADDR(bplist[i]))) == NULL)
 545                                 do_error(_("prefetch corruption\n"));
 546                 }
 547
 548                 if (which == PF_PRIMARY) {
 549                         for (inode_bufs = 0, i = 0; i < num; i++) {
 550                                 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
 551                                         inode_bufs++;
 552                         }
 553                         args->inode_bufs_queued -= inode_bufs;
 554                         if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
 555                                         pf_batch_fsbs)
 556                                 args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
 557                 }
 558 #ifdef XR_PF_TRACE
 559                 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
 560                         (long long)XFS_BUF_ADDR(bplist[0]),
 561                         (long long)XFS_BUF_ADDR(bplist[num-1]), num,
 562                         (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
 563                         args->last_bno_read, args->inode_bufs_queued);
 564 #endif
 565                 pthread_mutex_unlock(&args->lock);
 566
 567                 /*
 568                  * now read the data and put into the xfs_but_t's
 569                  */
 570                 len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);
 571
 572                 /*
 573                  * Check the last buffer on the list to see if we need to
 574                  * process a discontiguous buffer. The gather above loop
 575                  * guarantees that only the last buffer in the list will be a
 576                  * discontiguous buffer.
 577                  */
 578                 if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
 579                         libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
 580                         bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
 581                         libxfs_putbuf(bplist[num - 1]);
 582                         num--;
 583                 }
 584
 585                 if (len > 0) {
 586                         /*
 587                          * go through the xfs_buf_t list copying from the
 588                          * read buffer into the xfs_buf_t's and release them.
 589                          */
 590                         for (i = 0; i < num; i++) {
 591
 592                                 pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
 593                                 size = XFS_BUF_SIZE(bplist[i]);
 594                                 if (len < size)
 595                                         break;
 596                                 memcpy(bplist[i]->b_addr, pbuf, size);
 597                                 bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
 598                                                        LIBXFS_B_UNCHECKED);
 599                                 len -= size;
 600                                 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
 601                                         pf_read_inode_dirs(args, bplist[i]);
 602                                 else if (which == PF_META_ONLY)
 603                                         XFS_BUF_SET_PRIORITY(bplist[i],
 604                                                                 B_DIR_META_H);
 605                                 else if (which == PF_PRIMARY && num == 1)
 606                                         XFS_BUF_SET_PRIORITY(bplist[i],
 607                                                                 B_DIR_META_S);
 608                         }
 609                 }
 610                 for (i = 0; i < num; i++) {
 611                         pftrace("putbuf %c %p (%llu) in AG %d",
 612                                 B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
 613                                 bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
 614                                 args->agno);
 615                         libxfs_putbuf(bplist[i]);
 616                 }
 617                 pthread_mutex_lock(&args->lock);
 618                 if (which != PF_SECONDARY) {
 619                         pftrace("inode_bufs_queued for AG %d = %d", args->agno,
 620                                 args->inode_bufs_queued);
 621                         /*
 622                          * if primary inode queue running low, process metadata
 623                          * in boths queues to avoid I/O starvation as the
 624                          * processing thread would be waiting for a metadata
 625                          * buffer
 626                          */
 627                         if (which == PF_PRIMARY && !args->queuing_done &&
 628                                         args->inode_bufs_queued < IO_THRESHOLD) {
 629                                 pftrace("reading metadata bufs from primary queue for AG %d",
 630                                         args->agno);
 631
 632                                 pf_batch_read(args, PF_META_ONLY, buf);
 633
 634                                 pftrace("reading bufs from secondary queue for AG %d",
 635                                         args->agno);
 636
 637                                 pf_batch_read(args, PF_SECONDARY, buf);
 638                         }
 639                 }
 640         }
 641 }
 642
 643 static void *
 644 pf_io_worker(
 645         void                    *param)
 646 {
 647         prefetch_args_t         *args = param;
 648         void                    *buf = memalign(libxfs_device_alignment(),
 649                                                 pf_max_bytes);
 650
 651         if (buf == NULL)
 652                 return NULL;
 653
 654         pthread_mutex_lock(&args->lock);
 655         while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
 656                 pftrace("waiting to start prefetch I/O for AG %d", args->agno);
 657
 658                 while (!args->can_start_reading && !args->queuing_done)
 659                         pthread_cond_wait(&args->start_reading, &args->lock);
 660
 661                 pftrace("starting prefetch I/O for AG %d", args->agno);
 662
 663                 pf_batch_read(args, PF_PRIMARY, buf);
 664                 pf_batch_read(args, PF_SECONDARY, buf);
 665
 666                 pftrace("ran out of bufs to prefetch for AG %d", args->agno);
 667
 668                 if (!args->queuing_done)
 669                         args->can_start_reading = 0;
 670         }
 671         pthread_mutex_unlock(&args->lock);
 672
 673         free(buf);
 674
 675         pftrace("finished prefetch I/O for AG %d", args->agno);
 676
 677         return NULL;
 678 }
 679
 680 static int
 681 pf_create_prefetch_thread(
 682         prefetch_args_t         *args);
 683
 684 /*
 685  * If we fail to create the queuing thread or can't create even one
 686  * prefetch thread, we need to let processing continue without it.
 687  */
 688 static void
 689 pf_skip_prefetch_thread(prefetch_args_t *args)
 690 {
 691         prefetch_args_t *next;
 692
 693         pthread_mutex_lock(&args->lock);
 694         args->prefetch_done = 1;
 695         pf_start_processing(args);
 696         next = args->next_args;
 697         args->next_args = NULL;
 698         pthread_mutex_unlock(&args->lock);
 699
 700         if (next)
 701                 pf_create_prefetch_thread(next);
 702 }
 703
 704 static void *
 705 pf_queuing_worker(
 706         void                    *param)
 707 {
 708         prefetch_args_t         *args = param;
 709         prefetch_args_t         *next_args;
 710         int                     num_inos;
 711         ino_tree_node_t         *irec;
 712         ino_tree_node_t         *cur_irec;
 713         int                     blks_per_cluster;
 714         xfs_agblock_t           bno;
 715         int                     i;
 716         int                     err;
 717         uint64_t                sparse;
 718
 719         blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
 720         if (blks_per_cluster == 0)
 721                 blks_per_cluster = 1;
 722
 723         for (i = 0; i < PF_THREAD_COUNT; i++) {
 724                 err = pthread_create(&args->io_threads[i], NULL,
 725                                 pf_io_worker, args);
 726                 if (err != 0) {
 727                         do_warn(_("failed to create prefetch thread: %s\n"),
 728                                 strerror(err));
 729                         pftrace("failed to create prefetch thread for AG %d: %s",
 730                                 args->agno, strerror(err));
 731                         args->io_threads[i] = 0;
 732                         if (i == 0) {
 733                                 pf_skip_prefetch_thread(args);
 734                                 return NULL;
 735                         }
 736                         /*
 737                          * since we have at least one I/O thread, use them for
 738                          * prefetch
 739                          */
 740                         break;
 741                 }
 742         }
 743         pftrace("starting prefetch for AG %d", args->agno);
 744
 745         for (irec = findfirst_inode_rec(args->agno); irec != NULL;
 746                         irec = next_ino_rec(irec)) {
 747
 748                 cur_irec = irec;
 749
 750                 num_inos = XFS_INODES_PER_CHUNK;
 751                 while (num_inos < mp->m_ialloc_inos && irec != NULL) {
 752                         irec = next_ino_rec(irec);
 753                         num_inos += XFS_INODES_PER_CHUNK;
 754                 }
 755
 756                 if (args->dirs_only && cur_irec->ino_isa_dir == 0)
 757                         continue;
 758 #ifdef XR_PF_TRACE
 759                 sem_getvalue(&args->ra_count, &i);
 760                 pftrace("queuing irec %p in AG %d, sem count = %d",
 761                         irec, args->agno, i);
 762 #endif
 763                 err = sem_trywait(&args->ra_count);
 764                 if (err < 0 && errno == EAGAIN) {
 765                         /*
 766                          * Kick the queue once we have reached the limit;
 767                          * without this the threads processing the inodes
 768                          * might get stuck on a buffer that has been locked
 769                          * and added to the I/O queue but is waiting for
 770                          * the thread to be woken.
 771                          */
 772                         pf_start_io_workers(args);
 773                         sem_wait(&args->ra_count);
 774                 }
 775
 776                 num_inos = 0;
 777                 bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
 778                 sparse = cur_irec->ir_sparse;
 779
 780                 do {
 781                         struct xfs_buf_map      map;
 782
 783                         map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
 784                         map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
 785
 786                         /*
 787                          * Queue I/O for each non-sparse cluster. We can check
 788                          * sparse state in cluster sized chunks as cluster size
 789                          * is the min. granularity of sparse irec regions.
 790                          */
 791                         if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
 792                                 pf_queue_io(args, &map, 1,
 793                                             (cur_irec->ino_isa_dir != 0) ?
 794                                              B_DIR_INODE : B_INODE);
 795
 796                         bno += blks_per_cluster;
 797                         num_inos += inodes_per_cluster;
 798                         sparse >>= inodes_per_cluster;
 799                 } while (num_inos < mp->m_ialloc_inos);
 800         }
 801
 802         pthread_mutex_lock(&args->lock);
 803
 804         pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
 805                 args->agno, args->inode_bufs_queued);
 806
 807         args->queuing_done = 1;
 808         pf_start_io_workers(args);
 809         pf_start_processing(args);
 810         pthread_mutex_unlock(&args->lock);
 811
 812         /* now wait for the readers to finish */
 813         for (i = 0; i < PF_THREAD_COUNT; i++)
 814                 if (args->io_threads[i])
 815                         pthread_join(args->io_threads[i], NULL);
 816
 817         pftrace("prefetch for AG %d finished", args->agno);
 818
 819         pthread_mutex_lock(&args->lock);
 820
 821         ASSERT(btree_is_empty(args->io_queue));
 822
 823         args->prefetch_done = 1;
 824         next_args = args->next_args;
 825         args->next_args = NULL;
 826         pthread_mutex_unlock(&args->lock);
 827
 828         if (next_args)
 829                 pf_create_prefetch_thread(next_args);
 830
 831         return NULL;
 832 }
 833
 834 static int
 835 pf_create_prefetch_thread(
 836         prefetch_args_t         *args)
 837 {
 838         int                     err;
 839
 840         pftrace("creating queue thread for AG %d", args->agno);
 841
 842         err = pthread_create(&args->queuing_thread, NULL,
 843                         pf_queuing_worker, args);
 844         if (err != 0) {
 845                 do_warn(_("failed to create prefetch thread: %s\n"),
 846                         strerror(err));
 847                 pftrace("failed to create prefetch thread for AG %d: %s",
 848                         args->agno, strerror(err));
 849                 args->queuing_thread = 0;
 850                 pf_skip_prefetch_thread(args);
 851         }
 852
 853         return err == 0;
 854 }
 855
 856 void
 857 init_prefetch(
 858         xfs_mount_t             *pmp)
 859 {
 860         mp = pmp;
 861         mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
 862         pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
 863         pf_max_bbs = pf_max_bytes >> BBSHIFT;
 864         pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
 865         pf_batch_bytes = DEF_BATCH_BYTES;
 866         pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
 867 }
 868
 869 prefetch_args_t *
 870 start_inode_prefetch(
 871         xfs_agnumber_t          agno,
 872         int                     dirs_only,
 873         prefetch_args_t         *prev_args)
 874 {
 875         prefetch_args_t         *args;
 876         long                    max_queue;
 877
 878         if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
 879                 return NULL;
 880
 881         args = calloc(1, sizeof(prefetch_args_t));
 882
 883         btree_init(&args->io_queue);
 884         if (pthread_mutex_init(&args->lock, NULL) != 0)
 885                 do_error(_("failed to initialize prefetch mutex\n"));
 886         if (pthread_cond_init(&args->start_reading, NULL) != 0)
 887                 do_error(_("failed to initialize prefetch cond var\n"));
 888         if (pthread_cond_init(&args->start_processing, NULL) != 0)
 889                 do_error(_("failed to initialize prefetch cond var\n"));
 890         args->agno = agno;
 891         args->dirs_only = dirs_only;
 892
 893         /*
 894          * use only 1/8 of the libxfs cache as we are only counting inodes
 895          * and not any other associated metadata like directories
 896          */
 897
 898         max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
 899         if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
 900                 max_queue = max_queue *
 901                         (mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
 902                         mp->m_ialloc_blks;
 903
 904         sem_init(&args->ra_count, 0, max_queue);
 905
 906         if (!prev_args) {
 907                 if (!pf_create_prefetch_thread(args))
 908                         return NULL;
 909         } else {
 910                 pthread_mutex_lock(&prev_args->lock);
 911                 if (prev_args->prefetch_done) {
 912                         pthread_mutex_unlock(&prev_args->lock);
 913                         if (!pf_create_prefetch_thread(args))
 914                                 args = NULL;
 915                 } else {
 916                         prev_args->next_args = args;
 917                         pftrace("queued AG %d after AG %d",
 918                                 args->agno, prev_args->agno);
 919                         pthread_mutex_unlock(&prev_args->lock);
 920                 }
 921         }
 922
 923         return args;
 924 }
 925
 926 /*
 927  * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
 928  * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
 929  * or process @end_ag). The function starts prefetch on the first AG, then loops
 930  * starting prefetch on the next AG and then blocks processing the current AG as
 931  * the prefetch queue brings inodes into the processing queue.
 932  *
 933  * There is only one prefetch taking place at a time, so the prefetch on the
 934  * next AG only starts once the current AG has been completely prefetched. Hence
 935  * the prefetch of the next AG will start some time before the processing of the
 936  * current AG finishes, ensuring that when we iterate and start processing the
 937  * next AG there is already a significant queue of inodes to process.
 938  *
 939  * Prefetch is done this way to prevent it from running too far ahead of the
 940  * processing. Allowing it to do so can cause cache thrashing, where new
 941  * prefetch causes previously prefetched buffers to be reclaimed before the
 942  * processing thread uses them. This results in reading all the inodes and
 943  * metadata twice per phase and it greatly slows down the processing. Hence we
 944  * have to carefully control how far ahead we prefetch...
 945  */
 946 static void
 947 prefetch_ag_range(
 948         struct workqueue        *work,
 949         xfs_agnumber_t          start_ag,
 950         xfs_agnumber_t          end_ag,
 951         bool                    dirs_only,
 952         void                    (*func)(struct workqueue *,
 953                                         xfs_agnumber_t, void *))
 954 {
 955         int                     i;
 956         struct prefetch_args    *pf_args[2];
 957
 958         pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
 959         for (i = start_ag; i < end_ag; i++) {
 960                 /* Don't prefetch end_ag */
 961                 if (i + 1 < end_ag)
 962                         pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
 963                                                 dirs_only, pf_args[i & 1]);
 964                 func(work, i, pf_args[i & 1]);
 965         }
 966 }
 967
 968 struct pf_work_args {
 969         xfs_agnumber_t  start_ag;
 970         xfs_agnumber_t  end_ag;
 971         bool            dirs_only;
 972         void            (*func)(struct workqueue *, xfs_agnumber_t, void *);
 973 };
 974
 975 static void
 976 prefetch_ag_range_work(
 977         struct workqueue        *work,
 978         xfs_agnumber_t          unused,
 979         void                    *args)
 980 {
 981         struct pf_work_args *wargs = args;
 982
 983         prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
 984                           wargs->dirs_only, wargs->func);
 985         free(args);
 986 }
 987
 988 /*
 989  * Do inode prefetch in the most optimal way for the context under which repair
 990  * has been run.
 991  */
 992 void
 993 do_inode_prefetch(
 994         struct xfs_mount        *mp,
 995         int                     stride,
 996         void                    (*func)(struct workqueue *,
 997                                         xfs_agnumber_t, void *),
 998         bool                    check_cache,
 999         bool                    dirs_only)
1000 {
1001         int                     i;
1002         struct workqueue        queue;
1003         struct workqueue        *queues;
1004         int                     queues_started = 0;
1005
1006         /*
1007          * If the previous phases of repair have not overflowed the buffer
1008          * cache, then we don't need to re-read any of the metadata in the
1009          * filesystem - it's all in the cache. In that case, run a thread per
1010          * CPU to maximise parallelism of the queue to be processed.
1011          */
1012         if (check_cache && !libxfs_bcache_overflowed()) {
1013                 queue.wq_ctx = mp;
1014                 create_work_queue(&queue, mp, libxfs_nproc());
1015                 for (i = 0; i < mp->m_sb.sb_agcount; i++)
1016                         queue_work(&queue, func, i, NULL);
1017                 destroy_work_queue(&queue);
1018                 return;
1019         }
1020
1021         /*
1022          * single threaded behaviour - single prefetch thread, processed
1023          * directly after each AG is queued.
1024          */
1025         if (!stride) {
1026                 queue.wq_ctx = mp;
1027                 prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
1028                                   dirs_only, func);
1029                 return;
1030         }
1031
1032         /*
1033          * create one worker thread for each segment of the volume
1034          */
1035         queues = malloc(thread_count * sizeof(struct workqueue));
1036         for (i = 0; i < thread_count; i++) {
1037                 struct pf_work_args *wargs;
1038
1039                 wargs = malloc(sizeof(struct pf_work_args));
1040                 wargs->start_ag = i * stride;
1041                 wargs->end_ag = min((i + 1) * stride,
1042                                     mp->m_sb.sb_agcount);
1043                 wargs->dirs_only = dirs_only;
1044                 wargs->func = func;
1045
1046                 create_work_queue(&queues[i], mp, 1);
1047                 queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
1048                 queues_started++;
1049
1050                 if (wargs->end_ag >= mp->m_sb.sb_agcount)
1051                         break;
1052         }
1053
1054         /*
1055          * wait for workers to complete
1056          */
1057         for (i = 0; i < queues_started; i++)
1058                 destroy_work_queue(&queues[i]);
1059         free(queues);
1060 }
1061
1062 void
1063 wait_for_inode_prefetch(
1064         prefetch_args_t         *args)
1065 {
1066         if (args == NULL)
1067                 return;
1068
1069         pthread_mutex_lock(&args->lock);
1070
1071         while (!args->can_start_processing) {
1072                 pftrace("waiting to start processing AG %d", args->agno);
1073
1074                 pthread_cond_wait(&args->start_processing, &args->lock);
1075         }
1076         pftrace("can start processing AG %d", args->agno);
1077
1078         pthread_mutex_unlock(&args->lock);
1079 }
1080
1081 void
1082 cleanup_inode_prefetch(
1083         prefetch_args_t         *args)
1084 {
1085         if (args == NULL)
1086                 return;
1087
1088         pftrace("waiting AG %d prefetch to finish", args->agno);
1089
1090         if (args->queuing_thread)
1091                 pthread_join(args->queuing_thread, NULL);
1092
1093         pftrace("AG %d prefetch done", args->agno);
1094
1095         ASSERT(args->next_args == NULL);
1096
1097         pthread_mutex_destroy(&args->lock);
1098         pthread_cond_destroy(&args->start_reading);
1099         pthread_cond_destroy(&args->start_processing);
1100         sem_destroy(&args->ra_count);
1101         btree_destroy(args->io_queue);
1102
1103         free(args);
1104 }
1105
1106 #ifdef XR_PF_TRACE
1107
/* Trace output stream; NULL whenever tracing is not (or no longer) active. */
static FILE	*pf_trace_file;

/*
 * Open the prefetch trace file and make it line buffered.
 *
 * If the file cannot be opened a message is printed on stderr and tracing
 * stays disabled; _pftrace() and pftrace_done() then do nothing.
 */
void
pftrace_init(void)
{
	static const char	path[] = "/tmp/xfs_repair_prefetch.trace";

	pf_trace_file = fopen(path, "w");
	if (!pf_trace_file) {
		fprintf(stderr, "cannot open prefetch trace file %s: %s\n",
			path, strerror(errno));
		return;
	}
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

/*
 * Close the trace file.  Safe to call when tracing was never started and
 * safe to call more than once.
 */
void
pftrace_done(void)
{
	if (!pf_trace_file)
		return;

	fclose(pf_trace_file);
	/* don't leave a dangling stream behind for late _pftrace() calls */
	pf_trace_file = NULL;
}

/*
 * Append one timestamped line to the trace file.
 *
 * @func: name of the calling function, printed before the message.
 * @msg:  printf-style format for the message, followed by its arguments.
 *
 * The formatted message is truncated to fit a 200 byte buffer.  Nothing is
 * written while tracing is inactive.
 */
void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	if (!pf_trace_file)
		return;

	gettimeofday(&tv, NULL);

	/* vsnprintf() always NUL-terminates a non-empty buffer */
	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	va_end(args);

	/* cast so the arguments really have the type %lu expects */
	fprintf(pf_trace_file, "%lu.%06lu  %s: %s\n",
		(unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec,
		func, buf);
}
1140
1141 #endif