// SPDX-License-Identifier: GPL-2.0

/*
 * Headers the code below plainly depends on (pthreads, the io_queue btree,
 * repair globals and the prefetch_args_t definition); the original include
 * block is only partially preserved here, so this list is a best-effort
 * reconstruction.
 */
#include "libxfs.h"
#include <pthread.h>
#include "btree.h"
#include "globals.h"
#include "incore.h"
#include "prefetch.h"
#include "err_protos.h"
/*
 * Performs prefetching by priming the libxfs cache, using a dedicated thread
 * that scans inodes and reads blocks ahead of when they are required.
 *
 * Any I/O errors can be safely ignored.
 */
static xfs_mount_t	*mp;
static int		mp_fd;

static int		pf_max_bytes;
static int		pf_max_bbs;
static int		pf_max_fsbs;
static int		pf_batch_bytes;
static int		pf_batch_fsbs;

static void		pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 */
/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	(CACHE_PREFETCH_PRIORITY + 7)
/* directory metadata in secondary queue */
#define B_DIR_META_2	(CACHE_PREFETCH_PRIORITY + 6)
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	(CACHE_PREFETCH_PRIORITY + 5)
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	(CACHE_PREFETCH_PRIORITY + 4)
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	(CACHE_PREFETCH_PRIORITY + 3)
/* inode clusters with directory inodes */
#define B_DIR_INODE	(CACHE_PREFETCH_PRIORITY + 2)
/* intermediate extent btree nodes */
#define B_BMAP		(CACHE_PREFETCH_PRIORITY + 1)
/* inode clusters without any directory entries */
#define B_INODE		CACHE_PREFETCH_PRIORITY
/*
 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
 * the buffer is for an inode or other metadata.
 */
#define B_IS_INODE(f)	(((f) & 5) == 0)
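/*
 * A quick sanity check of the arithmetic, assuming CACHE_PREFETCH_PRIORITY
 * has bits 0 and 2 clear (e.g. a multiple of 8): the two inode priorities
 * above sit at offsets 0 (B_INODE) and 2 (B_DIR_INODE), which also leave
 * bits 0 and 2 clear, while every metadata priority sits at offset 1, 3,
 * 4, 5, 6 or 7 and so has bit 0 or bit 2 set. With an assumed base of 8:
 *
 *	B_IS_INODE(B_INODE)	 -> (8 & 5) == 0	-> true
 *	B_IS_INODE(B_DIR_INODE)	 -> (10 & 5) == 0	-> true
 *	B_IS_INODE(B_DIR_META_S) -> (12 & 5) == 4	-> false
 */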
#define DEF_BATCH_BYTES	0x10000
#define IO_THRESHOLD	(MAX_BUFS * 2)
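/*
 * Note that IO_THRESHOLD plays a double role below: pf_queue_io() wakes
 * the I/O workers once this many inode buffers have been queued, and
 * pf_batch_read() treats the primary queue as running low once the count
 * of queued inode buffers drops back under it, at which point it drains
 * metadata from both queues to keep the processing threads fed.
 */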
typedef enum pf_which {
	PF_PRIMARY,
	PF_SECONDARY,
	PF_META_ONLY
} pf_which_t;
static void
pf_start_processing(
	prefetch_args_t		*args)
{
	if (!args->can_start_processing) {
		pftrace("signalling processing for AG %d", args->agno);

		args->can_start_processing = 1;
		pthread_cond_signal(&args->start_processing);
	}
}
static void
pf_start_io_workers(
	prefetch_args_t		*args)
{
	if (!args->can_start_reading) {
		pftrace("signalling reading for AG %d", args->agno);

		args->can_start_reading = 1;
		pthread_cond_broadcast(&args->start_reading);
	}
}
static void
pf_queue_io(
	prefetch_args_t		*args,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flag)
{
	xfs_buf_t		*bp;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);

	/*
	 * Never block on a buffer lock here, given that the actual repair
	 * code might lock buffers in a different order from us.  Given that
	 * the lock holder is either reading it from disk himself or
	 * completely overwriting it this behaviour is perfectly fine.
	 */
	bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
	if (!bp)
		return;

	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
				CACHE_PREFETCH_PRIORITY);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	btree_insert(args->io_queue, fsbno, bp);

	if (fsbno > args->last_bno_read) {
		if (B_IS_INODE(flag)) {
			args->inode_bufs_queued++;
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
	} else {
		ASSERT(!B_IS_INODE(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
	}

	pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
		"(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
		'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
		args->inode_bufs_queued, args->last_bno_read);

	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}
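/*
 * The fsbno/last_bno_read comparison above is what splits the queue in
 * two: buffers ahead of last_bno_read stay on the primary (sweep-order)
 * path, while directory metadata discovered behind the sweep is demoted
 * to B_DIR_META_2 and left for a secondary-queue pass, which is why the
 * ASSERT insists no inode buffer can take that branch.
 */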
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_filblks_t		cp = 0;	/* prev count */
	xfs_fileoff_t		op = 0;	/* prev offset */
#define MAP_ARRAY_SZ 4
	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
	struct xfs_buf_map	*map = map_array;
	int			max_extents = MAP_ARRAY_SZ;
	int			nmaps = 0;
	unsigned int		len = 0;
	int			ret = 0;

	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			goto out_free;

		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			goto out_free;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dir_geo->freeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		while (irec.br_blockcount) {
			unsigned int	bm_len;

			pftrace("queuing dir extent in AG %d", args->agno);

			if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
				bm_len = mp->m_dir_geo->fsbcount - len;
			else
				bm_len = irec.br_blockcount;
			len += bm_len;

			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
					irec.br_startblock);
			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
			nmaps++;

			if (len == mp->m_dir_geo->fsbcount) {
				pf_queue_io(args, map, nmaps, B_DIR_META);
				len = 0;
				nmaps = 0;
			}

			irec.br_blockcount -= bm_len;
			irec.br_startblock += bm_len;

			/*
			 * Handle very fragmented dir2 blocks with dynamically
			 * allocated buffer maps.
			 */
			if (nmaps >= max_extents) {
				struct xfs_buf_map *old_map = NULL;

				if (map == map_array) {
					old_map = map;
					map = NULL;
				}
				max_extents *= 2;
				map = realloc(map, max_extents * sizeof(*map));
				if (map == NULL)
					do_error(
			_("couldn't malloc dir2 buffer list\n"));

				if (old_map)
					memcpy(map, old_map, sizeof(map_array));
			}
		}
	}
	ret = 1;
out_free:
	if (map != map_array)
		free(map);
	return ret;
}
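/*
 * The reclist walk above batches directory extents so that each call to
 * pf_queue_io() covers exactly one directory block (m_dir_geo->fsbcount
 * fs blocks), however many discontiguous extents that takes; the map
 * array only needs to grow (by doubling max_extents) for pathologically
 * fragmented directories where a single block spans more than
 * MAP_ARRAY_SZ extents.
 */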
/*
 * simplified version of the main scan_lbtree. Returns 0 to stop.
 */
static int
pf_scan_lbtree(
	xfs_fsblock_t		dbno,
	int			level,
	int			isadir,
	prefetch_args_t		*args,
	int			(*func)(struct xfs_btree_block	*block,
					int			level,
					int			isadir,
					prefetch_args_t		*args))
{
	xfs_buf_t		*bp;
	int			rc;

	bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
			XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
	if (!bp)
		return 0;

	XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);

	/*
	 * If the verifier flagged a problem with the buffer, we can't trust
	 * its contents for the purposes of reading ahead.  Stop prefetching
	 * the tree and mark the buffer unchecked so that the next read of the
	 * buffer will retain the error status and be acted upon appropriately.
	 */
	if (bp->b_error) {
		bp->b_flags |= LIBXFS_B_UNCHECKED;
		libxfs_putbuf(bp);
		return 0;
	}

	rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);

	libxfs_putbuf(bp);

	return rc;
}
static int
pf_scanfunc_bmap(
	struct xfs_btree_block	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_ptr_t		*pp;
	int			numrecs;
	int			i;
	xfs_fsblock_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
	     block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;
		return pf_read_bmbt_reclist(args,
				XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
	}

	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}
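/*
 * Because pf_scan_lbtree() hands level - 1 to this callback and the
 * bb_level check above rejects any block that does not match, the
 * recursion is strictly depth-bounded; a corrupt btree with cycles or
 * inconsistent levels simply terminates the prefetch scan with 0.
 */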
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_fsblock_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}
static void
pf_read_exinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino)
{
	pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
			be32_to_cpu(dino->di_nextents));
}
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;

	libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
	if (bp->b_error)
		return;

	for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) {
		dino = xfs_make_iptr(mp, bp, icnt);

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dino->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!libxfs_dinode_good_version(mp, dino->di_version))
			continue;

		if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dino->di_forkoff != 0) &&
		    (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
			continue;

		switch (dino->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}
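/*
 * Raising the whole cluster buffer to B_DIR_INODE when any directory
 * inode was found (hasdir) is what implements the priority ranking at the
 * top of this file: clusters containing directories survive in the libxfs
 * cache longer than clusters holding only non-directory inodes.
 */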
/*
 * pf_batch_read must be called with the lock locked.
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno = 0;
	unsigned long		max_fsbno;
	char			*pbuf;

	for (;;) {
		num = 0;
		if (which == PF_SECONDARY) {
			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
			max_fsbno = min(fsbno + pf_max_fsbs,
					args->last_bno_read);
		} else {
			bplist[0] = btree_find(args->io_queue,
					args->last_bno_read, &fsbno);
			max_fsbno = fsbno + pf_max_fsbs;
		}
		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
			/*
			 * Discontiguous buffers need special handling, so stop
			 * gathering new buffers and process the list and this
			 * discontiguous buffer immediately. This avoids the
			 * complexity of keeping a separate discontiguous buffer
			 * list and seeking back over ranges we've already done
			 * optimised reads for.
			 */
			if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
				num++;
				break;
			}
			if (which != PF_META_ONLY ||
			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
				num++;
			if (num == MAX_BUFS)
				break;
			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
		}
		if (!num)
			return;

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks and
		 * read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		while (num > 1 && last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		for (i = 0; i < num; i++) {
			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >>
						mp->m_sb.sb_blocklog);
		}

		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);

		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_buf_t's
		 */
		len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);

		/*
		 * Check the last buffer on the list to see if we need to
		 * process a discontiguous buffer. The gather above loop
		 * guarantees that only the last buffer in the list will be a
		 * discontiguous buffer.
		 */
		if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
			libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
			bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
			libxfs_putbuf(bplist[num - 1]);
			num--;
		}

		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			for (i = 0; i < num; i++) {
				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;
				memcpy(bplist[i]->b_addr, pbuf, size);
				bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
						       LIBXFS_B_UNCHECKED);
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
							B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
							B_DIR_META_S);
			}
		}
		for (i = 0; i < num; i++) {
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);

			/*
			 * if primary inode queue running low, process metadata
			 * in both queues to avoid I/O starvation as the
			 * processing thread would be waiting for a metadata
			 * buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_META_ONLY, buf);

				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}
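/*
 * The tail recursion above is bounded by construction: only a PF_PRIMARY
 * pass can recurse, and it recurses with PF_META_ONLY and PF_SECONDARY,
 * neither of which can recurse again, so a single pf_batch_read() call
 * nests at most one level deep.
 */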
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	void			*buf = memalign(libxfs_device_alignment(),
						pf_max_bytes);

	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);

		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);

		pftrace("starting prefetch I/O for AG %d", args->agno);

		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

		pftrace("ran out of bufs to prefetch for AG %d", args->agno);

		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

	pftrace("finished prefetch I/O for AG %d", args->agno);

	return NULL;
}
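/*
 * Each I/O worker allocates its own pf_max_bytes staging buffer with
 * memalign() at the device's I/O alignment, presumably so that the large
 * pread() in pf_batch_read() stays legal if the device was opened for
 * direct I/O; the worker then loops until queuing has finished and the
 * io_queue has been fully drained.
 */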
static int
pf_create_prefetch_thread(
	prefetch_args_t		*args);
/*
 * If we fail to create the queuing thread or can't create even one
 * prefetch thread, we need to let processing continue without it.
 */
static void
pf_skip_prefetch_thread(prefetch_args_t *args)
{
	prefetch_args_t		*next;

	pthread_mutex_lock(&args->lock);
	args->prefetch_done = 1;
	pf_start_processing(args);
	next = args->next_args;
	args->next_args = NULL;
	pthread_mutex_unlock(&args->lock);

	if (next)
		pf_create_prefetch_thread(next);
}
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	prefetch_args_t		*next_args;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;
	uint64_t		sparse;

	blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			pftrace("failed to create prefetch thread for AG %d: %s",
				args->agno, strerror(err));
			args->io_threads[i] = 0;
			if (i == 0) {
				pf_skip_prefetch_thread(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}

	pftrace("starting prefetch for AG %d", args->agno);

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < mp->m_ialloc_inos && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		err = sem_trywait(&args->ra_count);
		if (err < 0 && errno == EAGAIN) {
			/*
			 * Kick the queue once we have reached the limit;
			 * without this the threads processing the inodes
			 * might get stuck on a buffer that has been locked
			 * and added to the I/O queue but is waiting for
			 * the thread to be woken.
			 */
			pf_start_io_workers(args);
			sem_wait(&args->ra_count);
		}

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
		sparse = cur_irec->ir_sparse;

		do {
			struct xfs_buf_map	map;

			map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
			map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);

			/*
			 * Queue I/O for each non-sparse cluster. We can check
			 * sparse state in cluster sized chunks as cluster size
			 * is the min. granularity of sparse irec regions.
			 */
			if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
				pf_queue_io(args, &map, 1,
						(cur_irec->ino_isa_dir != 0) ?
						B_DIR_INODE : B_INODE);

			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
			sparse >>= inodes_per_cluster;
		} while (num_inos < mp->m_ialloc_inos);
	}

	pthread_mutex_lock(&args->lock);

	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);

	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

	pftrace("prefetch for AG %d finished", args->agno);

	pthread_mutex_lock(&args->lock);

	ASSERT(btree_is_empty(args->io_queue));

	args->prefetch_done = 1;
	next_args = args->next_args;
	args->next_args = NULL;
	pthread_mutex_unlock(&args->lock);

	if (next_args)
		pf_create_prefetch_thread(next_args);

	return NULL;
}
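/*
 * The ra_count semaphore consumed above (one unit per inode chunk queued,
 * initialised from the cache-derived max_queue in start_inode_prefetch())
 * is the backpressure that keeps this queuing thread from running
 * arbitrarily far ahead of the threads consuming the prefetched buffers.
 */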
static int
pf_create_prefetch_thread(
	prefetch_args_t		*args)
{
	int			err;

	pftrace("creating queue thread for AG %d", args->agno);

	err = pthread_create(&args->queuing_thread, NULL,
			pf_queuing_worker, args);
	if (err != 0) {
		do_warn(_("failed to create prefetch thread: %s\n"),
			strerror(err));
		pftrace("failed to create prefetch thread for AG %d: %s",
			args->agno, strerror(err));
		args->queuing_thread = 0;
		pf_skip_prefetch_thread(args);
	}

	return err == 0;
}
void
init_prefetch(
	xfs_mount_t		*pmp)
{
	mp = pmp;
	mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
	pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
	pf_max_bbs = pf_max_bytes >> BBSHIFT;
	pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
	pf_batch_bytes = DEF_BATCH_BYTES;
	pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
}
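/*
 * A worked example of the sizing above, assuming 4096-byte pages and
 * 4096-byte filesystem blocks (sb_blocklog = 12, BBSHIFT = 9):
 *
 *	pf_max_bytes   = 4096 << 7     = 524288 (512 KiB per large read)
 *	pf_max_bbs     = 524288 >> 9   = 1024 basic blocks
 *	pf_max_fsbs    = 524288 >> 12  = 128 fs blocks
 *	pf_batch_bytes = 0x10000       = 65536 (64 KiB)
 *	pf_batch_fsbs  = 65536 >> 13   = 8 fs blocks
 */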
prefetch_args_t *
start_inode_prefetch(
	xfs_agnumber_t		agno,
	int			dirs_only,
	prefetch_args_t		*prev_args)
{
	prefetch_args_t		*args;
	long			max_queue;

	if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
		return NULL;

	args = calloc(1, sizeof(prefetch_args_t));

	btree_init(&args->io_queue);
	if (pthread_mutex_init(&args->lock, NULL) != 0)
		do_error(_("failed to initialize prefetch mutex\n"));
	if (pthread_cond_init(&args->start_reading, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	if (pthread_cond_init(&args->start_processing, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	args->agno = agno;
	args->dirs_only = dirs_only;

	/*
	 * use only 1/8 of the libxfs cache as we are only counting inodes
	 * and not any other associated metadata like directories
	 */
	max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
	if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
		max_queue = max_queue *
			(mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
			mp->m_ialloc_blks;

	sem_init(&args->ra_count, 0, max_queue);

	if (!prev_args) {
		if (!pf_create_prefetch_thread(args))
			return NULL;
	} else {
		pthread_mutex_lock(&prev_args->lock);
		if (prev_args->prefetch_done) {
			pthread_mutex_unlock(&prev_args->lock);
			if (!pf_create_prefetch_thread(args))
				args = NULL;
		} else {
			prev_args->next_args = args;
			pftrace("queued AG %d after AG %d",
				args->agno, prev_args->agno);
			pthread_mutex_unlock(&prev_args->lock);
		}
	}

	return args;
}
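/*
 * To make the max_queue arithmetic concrete with assumed numbers: with
 * c_maxcount = 4096 cache entries and thread_count = 8, the base limit is
 * 4096 / 8 / 8 = 64; if inode clusters span 2 fs blocks and an inode
 * chunk (m_ialloc_blks) spans 8, the readahead semaphore is scaled to
 * 64 * 2 / 8 = 16 chunks in flight per AG.
 */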
/*
 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
 * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
 * or process @end_ag). The function starts prefetch on the first AG, then loops
 * starting prefetch on the next AG and then blocks processing the current AG as
 * the prefetch queue brings inodes into the processing queue.
 *
 * There is only one prefetch taking place at a time, so the prefetch on the
 * next AG only starts once the current AG has been completely prefetched. Hence
 * the prefetch of the next AG will start some time before the processing of the
 * current AG finishes, ensuring that when we iterate and start processing the
 * next AG there is already a significant queue of inodes to process.
 *
 * Prefetch is done this way to prevent it from running too far ahead of the
 * processing. Allowing it to do so can cause cache thrashing, where new
 * prefetch causes previously prefetched buffers to be reclaimed before the
 * processing thread uses them. This results in reading all the inodes and
 * metadata twice per phase and it greatly slows down the processing. Hence we
 * have to carefully control how far ahead we prefetch...
 */
static void
prefetch_ag_range(
	struct workqueue	*work,
	xfs_agnumber_t		start_ag,
	xfs_agnumber_t		end_ag,
	bool			dirs_only,
	void			(*func)(struct workqueue *,
					xfs_agnumber_t, void *))
{
	int			i;
	struct prefetch_args	*pf_args[2];

	pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
	for (i = start_ag; i < end_ag; i++) {
		/* Don't prefetch end_ag */
		if (i + 1 < end_ag)
			pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
						dirs_only, pf_args[i & 1]);
		func(work, i, pf_args[i & 1]);
	}
}
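/*
 * pf_args[] above is a two-slot ring indexed by AG parity: slot (i & 1)
 * holds the prefetch currently being processed and slot ((~i) & 1) the
 * one just started for AG i + 1, so at most two prefetch contexts, the
 * current AG and the next, are ever live at once.
 */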
struct pf_work_args {
	xfs_agnumber_t		start_ag;
	xfs_agnumber_t		end_ag;
	bool			dirs_only;
	void			(*func)(struct workqueue *, xfs_agnumber_t, void *);
};
static void
prefetch_ag_range_work(
	struct workqueue	*work,
	xfs_agnumber_t		unused,
	void			*args)
{
	struct pf_work_args	*wargs = args;

	prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
			wargs->dirs_only, wargs->func);
	free(args);
}
/*
 * Do inode prefetch in the most optimal way for the context under which repair
 * has been run.
 */
void
do_inode_prefetch(
	struct xfs_mount	*mp,
	int			stride,
	void			(*func)(struct workqueue *,
					xfs_agnumber_t, void *),
	bool			check_cache,
	bool			dirs_only)
{
	int			i;
	struct workqueue	queue;
	struct workqueue	*queues;
	int			queues_started = 0;

	/*
	 * If the previous phases of repair have not overflowed the buffer
	 * cache, then we don't need to re-read any of the metadata in the
	 * filesystem - it's all in the cache. In that case, run a thread per
	 * CPU to maximise parallelism of the queue to be processed.
	 */
	if (check_cache && !libxfs_bcache_overflowed()) {
		create_work_queue(&queue, mp, libxfs_nproc());
		for (i = 0; i < mp->m_sb.sb_agcount; i++)
			queue_work(&queue, func, i, NULL);
		destroy_work_queue(&queue);
		return;
	}

	/*
	 * single threaded behaviour - single prefetch thread, processed
	 * directly after each AG is queued.
	 */
	if (!stride) {
		queue.wq_ctx = mp;
		prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
				dirs_only, func);
		return;
	}

	/*
	 * create one worker thread for each segment of the volume
	 */
	queues = malloc(thread_count * sizeof(struct workqueue));
	for (i = 0; i < thread_count; i++) {
		struct pf_work_args *wargs;

		wargs = malloc(sizeof(struct pf_work_args));
		wargs->start_ag = i * stride;
		wargs->end_ag = min((i + 1) * stride,
				    mp->m_sb.sb_agcount);
		wargs->dirs_only = dirs_only;
		wargs->func = func;

		create_work_queue(&queues[i], mp, 1);
		queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
		queues_started++;

		if (wargs->end_ag >= mp->m_sb.sb_agcount)
			break;
	}

	/*
	 * wait for workers to complete
	 */
	for (i = 0; i < queues_started; i++)
		destroy_work_queue(&queues[i]);
	free(queues);
}
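/*
 * To summarise the three paths above: a cache that never overflowed gets
 * plain queue-per-CPU processing with no prefetch at all; stride == 0
 * gets one serial prefetch-and-process chain across every AG; otherwise
 * thread_count workers each run prefetch_ag_range() over a stride-sized
 * span of AGs, stopping early once sb_agcount is covered.
 */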
void
wait_for_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pthread_mutex_lock(&args->lock);

	while (!args->can_start_processing) {
		pftrace("waiting to start processing AG %d", args->agno);

		pthread_cond_wait(&args->start_processing, &args->lock);
	}
	pftrace("can start processing AG %d", args->agno);

	pthread_mutex_unlock(&args->lock);
}
void
cleanup_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pftrace("waiting AG %d prefetch to finish", args->agno);

	if (args->queuing_thread)
		pthread_join(args->queuing_thread, NULL);

	pftrace("AG %d prefetch done", args->agno);

	ASSERT(args->next_args == NULL);

	pthread_mutex_destroy(&args->lock);
	pthread_cond_destroy(&args->start_reading);
	pthread_cond_destroy(&args->start_processing);
	sem_destroy(&args->ra_count);
	btree_destroy(args->io_queue);

	free(args);
}
#ifdef XR_PF_TRACE

static FILE	*pf_trace_file;

void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

void
pftrace_done(void)
{
	fclose(pf_trace_file);
}

void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	gettimeofday(&tv, NULL);

	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	buf[sizeof(buf)-1] = '\0';
	va_end(args);

	fprintf(pf_trace_file, "%lu.%06lu  %s: %s\n", tv.tv_sec, tv.tv_usec,
		func, buf);
}

#endif