]>
git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - repair/prefetch.c
b4f20d948ea86549fa751e4b65a8319c5984cbea
10 #include "err_protos.h"
21 * Performs prefetching by priming the libxfs cache by using a dedicated thread
22 * scanning inodes and reading blocks in ahead of the time they are required.
24 * Any I/O errors can be safely ignored.
27 static xfs_mount_t
*mp
;
29 static int pf_max_bytes
;
30 static int pf_max_bbs
;
31 static int pf_max_fsbs
;
32 static int pf_batch_bytes
;
33 static int pf_batch_fsbs
;
35 static void pf_read_inode_dirs(prefetch_args_t
*, xfs_buf_t
*);
38 * Buffer priorities for the libxfs cache
40 * Directory metadata is ranked higher than other metadata as it's used
41 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
44 /* intermediate directory btree nodes - can't be queued */
45 #define B_DIR_BMAP CACHE_PREFETCH_PRIORITY + 7
46 /* directory metadata in secondary queue */
47 #define B_DIR_META_2 CACHE_PREFETCH_PRIORITY + 6
48 /* dir metadata that had to be fetched from the primary queue to avoid stalling */
49 #define B_DIR_META_H CACHE_PREFETCH_PRIORITY + 5
50 /* single block of directory metadata (can't batch read) */
51 #define B_DIR_META_S CACHE_PREFETCH_PRIORITY + 4
52 /* dir metadata with more than one block fetched in a single I/O */
53 #define B_DIR_META CACHE_PREFETCH_PRIORITY + 3
54 /* inode clusters with directory inodes */
55 #define B_DIR_INODE CACHE_PREFETCH_PRIORITY + 2
56 /* intermediate extent btree nodes */
57 #define B_BMAP CACHE_PREFETCH_PRIORITY + 1
58 /* inode clusters without any directory entries */
59 #define B_INODE CACHE_PREFETCH_PRIORITY
62 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
63 * the buffer is for an inode or other metadata.
65 #define B_IS_INODE(f) (((f) & 5) == 0)
67 #define DEF_BATCH_BYTES 0x10000
71 #define IO_THRESHOLD (MAX_BUFS * 2)
73 typedef enum pf_which
{
82 prefetch_args_t
*args
)
84 if (!args
->can_start_processing
) {
85 pftrace("signalling processing for AG %d", args
->agno
);
87 args
->can_start_processing
= 1;
88 pthread_cond_signal(&args
->start_processing
);
94 prefetch_args_t
*args
)
96 if (!args
->can_start_reading
) {
97 pftrace("signalling reading for AG %d", args
->agno
);
99 args
->can_start_reading
= 1;
100 pthread_cond_broadcast(&args
->start_reading
);
107 prefetch_args_t
*args
,
108 struct xfs_buf_map
*map
,
113 xfs_fsblock_t fsbno
= XFS_DADDR_TO_FSB(mp
, map
[0].bm_bn
);
116 * Never block on a buffer lock here, given that the actual repair
117 * code might lock buffers in a different order from us. Given that
118 * the lock holder is either reading it from disk himself or
119 * completely overwriting it this behaviour is perfectly fine.
121 bp
= libxfs_getbuf_map(mp
->m_dev
, map
, nmaps
, LIBXFS_GETBUF_TRYLOCK
);
125 if (bp
->b_flags
& LIBXFS_B_UPTODATE
) {
126 if (B_IS_INODE(flag
))
127 pf_read_inode_dirs(args
, bp
);
128 XFS_BUF_SET_PRIORITY(bp
, XFS_BUF_PRIORITY(bp
) +
129 CACHE_PREFETCH_PRIORITY
);
133 XFS_BUF_SET_PRIORITY(bp
, flag
);
135 pthread_mutex_lock(&args
->lock
);
137 btree_insert(args
->io_queue
, fsbno
, bp
);
139 if (fsbno
> args
->last_bno_read
) {
140 if (B_IS_INODE(flag
)) {
141 args
->inode_bufs_queued
++;
142 if (args
->inode_bufs_queued
== IO_THRESHOLD
)
143 pf_start_io_workers(args
);
146 ASSERT(!B_IS_INODE(flag
));
147 XFS_BUF_SET_PRIORITY(bp
, B_DIR_META_2
);
150 pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
151 "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag
) ?
152 'I' : 'M', bp
, (long long)XFS_BUF_ADDR(bp
), args
->agno
, fsbno
,
153 args
->inode_bufs_queued
, args
->last_bno_read
);
155 pf_start_processing(args
);
157 pthread_mutex_unlock(&args
->lock
);
161 pf_read_bmbt_reclist(
162 prefetch_args_t
*args
,
167 xfs_bmbt_irec_t irec
;
168 xfs_filblks_t cp
= 0; /* prev count */
169 xfs_fileoff_t op
= 0; /* prev offset */
170 #define MAP_ARRAY_SZ 4
171 struct xfs_buf_map map_array
[MAP_ARRAY_SZ
];
172 struct xfs_buf_map
*map
= map_array
;
173 int max_extents
= MAP_ARRAY_SZ
;
175 unsigned int len
= 0;
179 for (i
= 0; i
< numrecs
; i
++) {
180 libxfs_bmbt_disk_get_all(rp
+ i
, &irec
);
182 if (((i
> 0) && (op
+ cp
> irec
.br_startoff
)) ||
183 (irec
.br_blockcount
== 0) ||
184 (irec
.br_startoff
>= fs_max_file_offset
))
187 if (!verify_dfsbno(mp
, irec
.br_startblock
) || !verify_dfsbno(mp
,
188 irec
.br_startblock
+ irec
.br_blockcount
- 1))
191 if (!args
->dirs_only
&& ((irec
.br_startoff
+
192 irec
.br_blockcount
) >= mp
->m_dir_geo
->freeblk
))
193 break; /* only Phase 6 reads the free blocks */
195 op
= irec
.br_startoff
;
196 cp
= irec
.br_blockcount
;
198 while (irec
.br_blockcount
) {
201 pftrace("queuing dir extent in AG %d", args
->agno
);
203 if (len
+ irec
.br_blockcount
>= mp
->m_dir_geo
->fsbcount
)
204 bm_len
= mp
->m_dir_geo
->fsbcount
- len
;
206 bm_len
= irec
.br_blockcount
;
209 map
[nmaps
].bm_bn
= XFS_FSB_TO_DADDR(mp
,
211 map
[nmaps
].bm_len
= XFS_FSB_TO_BB(mp
, bm_len
);
214 if (len
== mp
->m_dir_geo
->fsbcount
) {
215 pf_queue_io(args
, map
, nmaps
, B_DIR_META
);
220 irec
.br_blockcount
-= bm_len
;
221 irec
.br_startblock
+= bm_len
;
224 * Handle very fragmented dir2 blocks with dynamically
225 * allocated buffer maps.
227 if (nmaps
>= max_extents
) {
228 struct xfs_buf_map
*old_map
= NULL
;
230 if (map
== map_array
) {
235 map
= realloc(map
, max_extents
* sizeof(*map
));
238 _("couldn't malloc dir2 buffer list\n"));
242 memcpy(map
, old_map
, sizeof(map_array
));
249 if (map
!= map_array
)
255 * simplified version of the main scan_lbtree. Returns 0 to stop.
263 prefetch_args_t
*args
,
264 int (*func
)(struct xfs_btree_block
*block
,
267 prefetch_args_t
*args
))
272 bp
= libxfs_readbuf(mp
->m_dev
, XFS_FSB_TO_DADDR(mp
, dbno
),
273 XFS_FSB_TO_BB(mp
, 1), 0, &xfs_bmbt_buf_ops
);
277 XFS_BUF_SET_PRIORITY(bp
, isadir
? B_DIR_BMAP
: B_BMAP
);
280 * If the verifier flagged a problem with the buffer, we can't trust
281 * its contents for the purposes of reading ahead. Stop prefetching
282 * the tree and mark the buffer unchecked so that the next read of the
283 * buffer will retain the error status and be acted upon appropriately.
286 bp
->b_flags
|= LIBXFS_B_UNCHECKED
;
291 rc
= (*func
)(XFS_BUF_TO_BLOCK(bp
), level
- 1, isadir
, args
);
300 struct xfs_btree_block
*block
,
303 prefetch_args_t
*args
)
311 * do some validation on the block contents
313 if ((block
->bb_magic
!= cpu_to_be32(XFS_BMAP_MAGIC
) &&
314 block
->bb_magic
!= cpu_to_be32(XFS_BMAP_CRC_MAGIC
)) ||
315 (be16_to_cpu(block
->bb_level
) != level
))
318 numrecs
= be16_to_cpu(block
->bb_numrecs
);
321 if (numrecs
> mp
->m_bmap_dmxr
[0] || !isadir
)
323 return pf_read_bmbt_reclist(args
,
324 XFS_BMBT_REC_ADDR(mp
, block
, 1), numrecs
);
327 if (numrecs
> mp
->m_bmap_dmxr
[1])
330 pp
= XFS_BMBT_PTR_ADDR(mp
, block
, 1, mp
->m_bmap_dmxr
[1]);
332 for (i
= 0; i
< numrecs
; i
++) {
333 dbno
= get_unaligned_be64(&pp
[i
]);
334 if (!verify_dfsbno(mp
, dbno
))
336 if (!pf_scan_lbtree(dbno
, level
, isadir
, args
, pf_scanfunc_bmap
))
345 prefetch_args_t
*args
,
349 xfs_bmdr_block_t
*dib
;
357 dib
= (xfs_bmdr_block_t
*)XFS_DFORK_DPTR(dino
);
359 level
= be16_to_cpu(dib
->bb_level
);
360 numrecs
= be16_to_cpu(dib
->bb_numrecs
);
362 if ((numrecs
== 0) || (level
== 0) ||
363 (level
> XFS_BM_MAXLEVELS(mp
, XFS_DATA_FORK
)))
366 * use bmdr/dfork_dsize since the root block is in the data fork
368 if (XFS_BMDR_SPACE_CALC(numrecs
) > XFS_DFORK_DSIZE(dino
, mp
))
371 dsize
= XFS_DFORK_DSIZE(dino
, mp
);
372 pp
= XFS_BMDR_PTR_ADDR(dib
, 1, xfs_bmdr_maxrecs(dsize
, 0));
374 for (i
= 0; i
< numrecs
; i
++) {
375 dbno
= get_unaligned_be64(&pp
[i
]);
376 if (!verify_dfsbno(mp
, dbno
))
378 if (!pf_scan_lbtree(dbno
, level
, isadir
, args
, pf_scanfunc_bmap
))
385 prefetch_args_t
*args
,
388 pf_read_bmbt_reclist(args
, (xfs_bmbt_rec_t
*)XFS_DFORK_DPTR(dino
),
389 be32_to_cpu(dino
->di_nextents
));
394 prefetch_args_t
*args
,
402 libxfs_readbuf_verify(bp
, &xfs_inode_buf_ops
);
406 for (icnt
= 0; icnt
< (XFS_BUF_COUNT(bp
) >> mp
->m_sb
.sb_inodelog
); icnt
++) {
407 dino
= xfs_make_iptr(mp
, bp
, icnt
);
410 * We are only prefetching directory contents in extents
411 * and btree nodes for other inodes
413 isadir
= (be16_to_cpu(dino
->di_mode
) & S_IFMT
) == S_IFDIR
;
416 if (dino
->di_format
<= XFS_DINODE_FMT_LOCAL
)
419 if (!isadir
&& (dino
->di_format
== XFS_DINODE_FMT_EXTENTS
||
424 * do some checks on the inode to see if we can prefetch
425 * its directory data. It's a cut down version of
426 * process_dinode_int() in dinode.c.
428 if (dino
->di_format
> XFS_DINODE_FMT_BTREE
)
431 if (be16_to_cpu(dino
->di_magic
) != XFS_DINODE_MAGIC
)
434 if (!xfs_dinode_good_version(mp
, dino
->di_version
))
437 if (be64_to_cpu(dino
->di_size
) <= XFS_DFORK_DSIZE(dino
, mp
))
440 if ((dino
->di_forkoff
!= 0) &&
441 (dino
->di_forkoff
>= XFS_LITINO(mp
, dino
->di_version
) >> 3))
444 switch (dino
->di_format
) {
445 case XFS_DINODE_FMT_EXTENTS
:
446 pf_read_exinode(args
, dino
);
448 case XFS_DINODE_FMT_BTREE
:
449 pf_read_btinode(args
, dino
, isadir
);
454 XFS_BUF_SET_PRIORITY(bp
, B_DIR_INODE
);
458 * pf_batch_read must be called with the lock locked.
462 prefetch_args_t
*args
,
466 xfs_buf_t
*bplist
[MAX_BUFS
];
468 off64_t first_off
, last_off
, next_off
;
472 unsigned long fsbno
= 0;
473 unsigned long max_fsbno
;
478 if (which
== PF_SECONDARY
) {
479 bplist
[0] = btree_find(args
->io_queue
, 0, &fsbno
);
480 max_fsbno
= MIN(fsbno
+ pf_max_fsbs
,
481 args
->last_bno_read
);
483 bplist
[0] = btree_find(args
->io_queue
,
484 args
->last_bno_read
, &fsbno
);
485 max_fsbno
= fsbno
+ pf_max_fsbs
;
487 while (bplist
[num
] && num
< MAX_BUFS
&& fsbno
< max_fsbno
) {
489 * Discontiguous buffers need special handling, so stop
490 * gathering new buffers and process the list and this
491 * discontiguous buffer immediately. This avoids the
492 * complexity of keeping a separate discontiguous buffer
493 * list and seeking back over ranges we've already done
494 * optimised reads for.
496 if ((bplist
[num
]->b_flags
& LIBXFS_B_DISCONTIG
)) {
501 if (which
!= PF_META_ONLY
||
502 !B_IS_INODE(XFS_BUF_PRIORITY(bplist
[num
])))
506 bplist
[num
] = btree_lookup_next(args
->io_queue
, &fsbno
);
512 * do a big read if 25% of the potential buffer is useful,
513 * otherwise, find as many close together blocks and
514 * read them in one read
516 first_off
= LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist
[0]));
517 last_off
= LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist
[num
-1])) +
518 XFS_BUF_SIZE(bplist
[num
-1]);
519 while (num
> 1 && last_off
- first_off
> pf_max_bytes
) {
521 last_off
= LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist
[num
-1])) +
522 XFS_BUF_SIZE(bplist
[num
-1]);
524 if (num
< ((last_off
- first_off
) >> (mp
->m_sb
.sb_blocklog
+ 3))) {
526 * not enough blocks for one big read, so determine
527 * the number of blocks that are close enough.
529 last_off
= first_off
+ XFS_BUF_SIZE(bplist
[0]);
530 for (i
= 1; i
< num
; i
++) {
531 next_off
= LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist
[i
])) +
532 XFS_BUF_SIZE(bplist
[i
]);
533 if (next_off
- last_off
> pf_batch_bytes
)
540 for (i
= 0; i
< num
; i
++) {
541 if (btree_delete(args
->io_queue
, XFS_DADDR_TO_FSB(mp
,
542 XFS_BUF_ADDR(bplist
[i
]))) == NULL
)
543 do_error(_("prefetch corruption\n"));
546 if (which
== PF_PRIMARY
) {
547 for (inode_bufs
= 0, i
= 0; i
< num
; i
++) {
548 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist
[i
])))
551 args
->inode_bufs_queued
-= inode_bufs
;
552 if (inode_bufs
&& (first_off
>> mp
->m_sb
.sb_blocklog
) >
554 args
->last_bno_read
= (first_off
>> mp
->m_sb
.sb_blocklog
);
557 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
558 (long long)XFS_BUF_ADDR(bplist
[0]),
559 (long long)XFS_BUF_ADDR(bplist
[num
-1]), num
,
560 (which
!= PF_SECONDARY
) ? "pri" : "sec", args
->agno
,
561 args
->last_bno_read
, args
->inode_bufs_queued
);
563 pthread_mutex_unlock(&args
->lock
);
566 * now read the data and put into the xfs_buf_t's
568 len
= pread64(mp_fd
, buf
, (int)(last_off
- first_off
), first_off
);
571 * Check the last buffer on the list to see if we need to
572 * process a discontiguous buffer. The gather loop above
573 * guarantees that only the last buffer in the list will be a
574 * discontiguous buffer.
576 if ((bplist
[num
- 1]->b_flags
& LIBXFS_B_DISCONTIG
)) {
577 libxfs_readbufr_map(mp
->m_ddev_targp
, bplist
[num
- 1], 0);
578 bplist
[num
- 1]->b_flags
|= LIBXFS_B_UNCHECKED
;
579 libxfs_putbuf(bplist
[num
- 1]);
585 * go through the xfs_buf_t list copying from the
586 * read buffer into the xfs_buf_t's and release them.
588 for (i
= 0; i
< num
; i
++) {
590 pbuf
= ((char *)buf
) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist
[i
])) - first_off
);
591 size
= XFS_BUF_SIZE(bplist
[i
]);
594 memcpy(XFS_BUF_PTR(bplist
[i
]), pbuf
, size
);
595 bplist
[i
]->b_flags
|= (LIBXFS_B_UPTODATE
|
598 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist
[i
])))
599 pf_read_inode_dirs(args
, bplist
[i
]);
600 else if (which
== PF_META_ONLY
)
601 XFS_BUF_SET_PRIORITY(bplist
[i
],
603 else if (which
== PF_PRIMARY
&& num
== 1)
604 XFS_BUF_SET_PRIORITY(bplist
[i
],
608 for (i
= 0; i
< num
; i
++) {
609 pftrace("putbuf %c %p (%llu) in AG %d",
610 B_IS_INODE(XFS_BUF_PRIORITY(bplist
[i
])) ? 'I' : 'M',
611 bplist
[i
], (long long)XFS_BUF_ADDR(bplist
[i
]),
613 libxfs_putbuf(bplist
[i
]);
615 pthread_mutex_lock(&args
->lock
);
616 if (which
!= PF_SECONDARY
) {
617 pftrace("inode_bufs_queued for AG %d = %d", args
->agno
,
618 args
->inode_bufs_queued
);
620 * if primary inode queue running low, process metadata
621 * in both queues to avoid I/O starvation as the
622 * processing thread would be waiting for a metadata
625 if (which
== PF_PRIMARY
&& !args
->queuing_done
&&
626 args
->inode_bufs_queued
< IO_THRESHOLD
) {
627 pftrace("reading metadata bufs from primary queue for AG %d",
630 pf_batch_read(args
, PF_META_ONLY
, buf
);
632 pftrace("reading bufs from secondary queue for AG %d",
635 pf_batch_read(args
, PF_SECONDARY
, buf
);
645 prefetch_args_t
*args
= param
;
646 void *buf
= memalign(libxfs_device_alignment(),
652 pthread_mutex_lock(&args
->lock
);
653 while (!args
->queuing_done
|| !btree_is_empty(args
->io_queue
)) {
654 pftrace("waiting to start prefetch I/O for AG %d", args
->agno
);
656 while (!args
->can_start_reading
&& !args
->queuing_done
)
657 pthread_cond_wait(&args
->start_reading
, &args
->lock
);
659 pftrace("starting prefetch I/O for AG %d", args
->agno
);
661 pf_batch_read(args
, PF_PRIMARY
, buf
);
662 pf_batch_read(args
, PF_SECONDARY
, buf
);
664 pftrace("ran out of bufs to prefetch for AG %d", args
->agno
);
666 if (!args
->queuing_done
)
667 args
->can_start_reading
= 0;
669 pthread_mutex_unlock(&args
->lock
);
673 pftrace("finished prefetch I/O for AG %d", args
->agno
);
679 pf_create_prefetch_thread(
680 prefetch_args_t
*args
);
686 prefetch_args_t
*args
= param
;
688 ino_tree_node_t
*irec
;
689 ino_tree_node_t
*cur_irec
;
690 int blks_per_cluster
;
696 blks_per_cluster
= mp
->m_inode_cluster_size
>> mp
->m_sb
.sb_blocklog
;
697 if (blks_per_cluster
== 0)
698 blks_per_cluster
= 1;
700 for (i
= 0; i
< PF_THREAD_COUNT
; i
++) {
701 err
= pthread_create(&args
->io_threads
[i
], NULL
,
704 do_warn(_("failed to create prefetch thread: %s\n"),
707 pf_start_processing(args
);
711 * since we have at least one I/O thread, use them for
717 pftrace("starting prefetch for AG %d", args
->agno
);
719 for (irec
= findfirst_inode_rec(args
->agno
); irec
!= NULL
;
720 irec
= next_ino_rec(irec
)) {
724 num_inos
= XFS_INODES_PER_CHUNK
;
725 while (num_inos
< mp
->m_ialloc_inos
&& irec
!= NULL
) {
726 irec
= next_ino_rec(irec
);
727 num_inos
+= XFS_INODES_PER_CHUNK
;
730 if (args
->dirs_only
&& cur_irec
->ino_isa_dir
== 0)
733 sem_getvalue(&args
->ra_count
, &i
);
734 pftrace("queuing irec %p in AG %d, sem count = %d",
735 irec
, args
->agno
, i
);
737 err
= sem_trywait(&args
->ra_count
);
738 if (err
< 0 && errno
== EAGAIN
) {
740 * Kick the queue once we have reached the limit;
741 * without this the threads processing the inodes
742 * might get stuck on a buffer that has been locked
743 * and added to the I/O queue but is waiting for
744 * the thread to be woken.
746 pf_start_io_workers(args
);
747 sem_wait(&args
->ra_count
);
751 bno
= XFS_AGINO_TO_AGBNO(mp
, cur_irec
->ino_startnum
);
752 sparse
= cur_irec
->ir_sparse
;
755 struct xfs_buf_map map
;
757 map
.bm_bn
= XFS_AGB_TO_DADDR(mp
, args
->agno
, bno
);
758 map
.bm_len
= XFS_FSB_TO_BB(mp
, blks_per_cluster
);
761 * Queue I/O for each non-sparse cluster. We can check
762 * sparse state in cluster sized chunks as cluster size
763 * is the min. granularity of sparse irec regions.
765 if ((sparse
& ((1ULL << inodes_per_cluster
) - 1)) == 0)
766 pf_queue_io(args
, &map
, 1,
767 (cur_irec
->ino_isa_dir
!= 0) ?
768 B_DIR_INODE
: B_INODE
);
770 bno
+= blks_per_cluster
;
771 num_inos
+= inodes_per_cluster
;
772 sparse
>>= inodes_per_cluster
;
773 } while (num_inos
< mp
->m_ialloc_inos
);
776 pthread_mutex_lock(&args
->lock
);
778 pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
779 args
->agno
, args
->inode_bufs_queued
);
781 args
->queuing_done
= 1;
782 pf_start_io_workers(args
);
783 pf_start_processing(args
);
784 pthread_mutex_unlock(&args
->lock
);
786 /* now wait for the readers to finish */
787 for (i
= 0; i
< PF_THREAD_COUNT
; i
++)
788 if (args
->io_threads
[i
])
789 pthread_join(args
->io_threads
[i
], NULL
);
791 pftrace("prefetch for AG %d finished", args
->agno
);
793 pthread_mutex_lock(&args
->lock
);
795 ASSERT(btree_is_empty(args
->io_queue
));
797 args
->prefetch_done
= 1;
799 pf_create_prefetch_thread(args
->next_args
);
801 pthread_mutex_unlock(&args
->lock
);
807 pf_create_prefetch_thread(
808 prefetch_args_t
*args
)
812 pftrace("creating queue thread for AG %d", args
->agno
);
814 err
= pthread_create(&args
->queuing_thread
, NULL
,
815 pf_queuing_worker
, args
);
817 do_warn(_("failed to create prefetch thread: %s\n"),
819 cleanup_inode_prefetch(args
);
830 mp_fd
= libxfs_device_to_fd(mp
->m_ddev_targp
->dev
);
831 pf_max_bytes
= sysconf(_SC_PAGE_SIZE
) << 7;
832 pf_max_bbs
= pf_max_bytes
>> BBSHIFT
;
833 pf_max_fsbs
= pf_max_bytes
>> mp
->m_sb
.sb_blocklog
;
834 pf_batch_bytes
= DEF_BATCH_BYTES
;
835 pf_batch_fsbs
= DEF_BATCH_BYTES
>> (mp
->m_sb
.sb_blocklog
+ 1);
839 start_inode_prefetch(
842 prefetch_args_t
*prev_args
)
844 prefetch_args_t
*args
;
847 if (!do_prefetch
|| agno
>= mp
->m_sb
.sb_agcount
)
850 args
= calloc(1, sizeof(prefetch_args_t
));
852 btree_init(&args
->io_queue
);
853 if (pthread_mutex_init(&args
->lock
, NULL
) != 0)
854 do_error(_("failed to initialize prefetch mutex\n"));
855 if (pthread_cond_init(&args
->start_reading
, NULL
) != 0)
856 do_error(_("failed to initialize prefetch cond var\n"));
857 if (pthread_cond_init(&args
->start_processing
, NULL
) != 0)
858 do_error(_("failed to initialize prefetch cond var\n"));
860 args
->dirs_only
= dirs_only
;
863 * use only 1/8 of the libxfs cache as we are only counting inodes
864 * and not any other associated metadata like directories
867 max_queue
= libxfs_bcache
->c_maxcount
/ thread_count
/ 8;
868 if (mp
->m_inode_cluster_size
> mp
->m_sb
.sb_blocksize
)
869 max_queue
= max_queue
*
870 (mp
->m_inode_cluster_size
>> mp
->m_sb
.sb_blocklog
) /
873 sem_init(&args
->ra_count
, 0, max_queue
);
876 if (!pf_create_prefetch_thread(args
))
879 pthread_mutex_lock(&prev_args
->lock
);
880 if (prev_args
->prefetch_done
) {
881 if (!pf_create_prefetch_thread(args
))
884 prev_args
->next_args
= args
;
885 pthread_mutex_unlock(&prev_args
->lock
);
892 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
893 * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
894 * or process @end_ag). The function starts prefetch on the first AG, then loops
895 * starting prefetch on the next AG and then blocks processing the current AG as
896 * the prefetch queue brings inodes into the processing queue.
898 * There is only one prefetch taking place at a time, so the prefetch on the
899 * next AG only starts once the current AG has been completely prefetched. Hence
900 * the prefetch of the next AG will start some time before the processing of the
901 * current AG finishes, ensuring that when we iterate and start processing the
902 * next AG there is already a significant queue of inodes to process.
904 * Prefetch is done this way to prevent it from running too far ahead of the
905 * processing. Allowing it to do so can cause cache thrashing, where new
906 * prefetch causes previously prefetched buffers to be reclaimed before the
907 * processing thread uses them. This results in reading all the inodes and
908 * metadata twice per phase and it greatly slows down the processing. Hence we
909 * have to carefully control how far ahead we prefetch...
913 struct work_queue
*work
,
914 xfs_agnumber_t start_ag
,
915 xfs_agnumber_t end_ag
,
917 void (*func
)(struct work_queue
*,
918 xfs_agnumber_t
, void *))
921 struct prefetch_args
*pf_args
[2];
923 pf_args
[start_ag
& 1] = start_inode_prefetch(start_ag
, dirs_only
, NULL
);
924 for (i
= start_ag
; i
< end_ag
; i
++) {
925 /* Don't prefetch end_ag */
927 pf_args
[(~i
) & 1] = start_inode_prefetch(i
+ 1,
928 dirs_only
, pf_args
[i
& 1]);
929 func(work
, i
, pf_args
[i
& 1]);
933 struct pf_work_args
{
934 xfs_agnumber_t start_ag
;
935 xfs_agnumber_t end_ag
;
937 void (*func
)(struct work_queue
*, xfs_agnumber_t
, void *);
941 prefetch_ag_range_work(
942 struct work_queue
*work
,
943 xfs_agnumber_t unused
,
946 struct pf_work_args
*wargs
= args
;
948 prefetch_ag_range(work
, wargs
->start_ag
, wargs
->end_ag
,
949 wargs
->dirs_only
, wargs
->func
);
954 * Do inode prefetch in the most optimal way for the context under which repair
959 struct xfs_mount
*mp
,
961 void (*func
)(struct work_queue
*,
962 xfs_agnumber_t
, void *),
967 struct work_queue queue
;
968 struct work_queue
*queues
;
969 int queues_started
= 0;
972 * If the previous phases of repair have not overflowed the buffer
973 * cache, then we don't need to re-read any of the metadata in the
974 * filesystem - it's all in the cache. In that case, run a thread per
975 * CPU to maximise parallelism of the queue to be processed.
977 if (check_cache
&& !libxfs_bcache_overflowed()) {
979 create_work_queue(&queue
, mp
, libxfs_nproc());
980 for (i
= 0; i
< mp
->m_sb
.sb_agcount
; i
++)
981 queue_work(&queue
, func
, i
, NULL
);
982 destroy_work_queue(&queue
);
987 * single threaded behaviour - single prefetch thread, processed
988 * directly after each AG is queued.
992 prefetch_ag_range(&queue
, 0, mp
->m_sb
.sb_agcount
,
998 * create one worker thread for each segment of the volume
1000 queues
= malloc(thread_count
* sizeof(work_queue_t
));
1001 for (i
= 0; i
< thread_count
; i
++) {
1002 struct pf_work_args
*wargs
;
1004 wargs
= malloc(sizeof(struct pf_work_args
));
1005 wargs
->start_ag
= i
* stride
;
1006 wargs
->end_ag
= min((i
+ 1) * stride
,
1007 mp
->m_sb
.sb_agcount
);
1008 wargs
->dirs_only
= dirs_only
;
1011 create_work_queue(&queues
[i
], mp
, 1);
1012 queue_work(&queues
[i
], prefetch_ag_range_work
, 0, wargs
);
1015 if (wargs
->end_ag
>= mp
->m_sb
.sb_agcount
)
1020 * wait for workers to complete
1022 for (i
= 0; i
< queues_started
; i
++)
1023 destroy_work_queue(&queues
[i
]);
1028 wait_for_inode_prefetch(
1029 prefetch_args_t
*args
)
1034 pthread_mutex_lock(&args
->lock
);
1036 while (!args
->can_start_processing
) {
1037 pftrace("waiting to start processing AG %d", args
->agno
);
1039 pthread_cond_wait(&args
->start_processing
, &args
->lock
);
1041 pftrace("can start processing AG %d", args
->agno
);
1043 pthread_mutex_unlock(&args
->lock
);
1047 cleanup_inode_prefetch(
1048 prefetch_args_t
*args
)
1053 pftrace("waiting AG %d prefetch to finish", args
->agno
);
1055 if (args
->queuing_thread
)
1056 pthread_join(args
->queuing_thread
, NULL
);
1058 pftrace("AG %d prefetch done", args
->agno
);
1060 pthread_mutex_destroy(&args
->lock
);
1061 pthread_cond_destroy(&args
->start_reading
);
1062 pthread_cond_destroy(&args
->start_processing
);
1063 sem_destroy(&args
->ra_count
);
1064 btree_destroy(args
->io_queue
);
1071 static FILE *pf_trace_file
;
1076 pf_trace_file
= fopen("/tmp/xfs_repair_prefetch.trace", "w");
1077 setvbuf(pf_trace_file
, NULL
, _IOLBF
, 1024);
1083 fclose(pf_trace_file
);
1087 _pftrace(const char *func
, const char *msg
, ...)
1093 gettimeofday(&tv
, NULL
);
1095 va_start(args
, msg
);
1096 vsnprintf(buf
, sizeof(buf
), msg
, args
);
1097 buf
[sizeof(buf
)-1] = '\0';
1100 fprintf(pf_trace_file
, "%lu.%06lu %s: %s\n", tv
.tv_sec
, tv
.tv_usec
,