/* repair/prefetch.c */
#include <libxfs.h>
#include <pthread.h>
#include <semaphore.h>
#include "avl.h"
#include "btree.h"
#include "globals.h"
#include "agheader.h"
#include "incore.h"
#include "prefetch.h"
#include "err_protos.h"
/*
 * Performs prefetching by priming the libxfs cache, using a dedicated
 * thread that scans inodes and reads blocks ahead of the time they are
 * required.
 *
 * Any I/O errors can be safely ignored.
 */
static xfs_mount_t      *mp;
static int              mp_fd;

static int              pf_max_bytes;
static int              pf_max_bbs;
static int              pf_max_fsbs;
static int              pf_batch_bytes;
static int              pf_batch_fsbs;
static void             pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 */
/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP      CACHE_PREFETCH_PRIORITY + 7
/* directory metadata in secondary queue */
#define B_DIR_META_2    CACHE_PREFETCH_PRIORITY + 6
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H    CACHE_PREFETCH_PRIORITY + 5
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S    CACHE_PREFETCH_PRIORITY + 4
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META      CACHE_PREFETCH_PRIORITY + 3
/* inode clusters with directory inodes */
#define B_DIR_INODE     CACHE_PREFETCH_PRIORITY + 2
/* intermediate extent btree nodes */
#define B_BMAP          CACHE_PREFETCH_PRIORITY + 1
/* inode clusters without any directory entries */
#define B_INODE         CACHE_PREFETCH_PRIORITY
/*
 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
 * the buffer is for an inode or other metadata.
 */
#define B_IS_INODE(f)   (((f) & 5) == 0)
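/*
 * Worked example of the tag test, assuming CACHE_PREFETCH_PRIORITY is a
 * multiple of 8 so its low three bits are clear (it is 8 in current
 * libxfs): B_INODE (base + 0) and B_DIR_INODE (base + 2) are the only
 * priorities with both bit 0 and bit 2 clear, so B_IS_INODE() is true
 * exactly for inode cluster buffers, while the six metadata priorities
 * (base + 1, + 3, + 4, + 5, + 6, + 7) all have bit 0 or bit 2 set.
 */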
#define DEF_BATCH_BYTES 0x10000

#define IO_THRESHOLD    (MAX_BUFS * 2)
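/*
 * DEF_BATCH_BYTES (0x10000, i.e. 64 KiB) is the default granularity used
 * when deciding whether queued buffers are close enough together to be
 * combined into one read.  IO_THRESHOLD is twice the size of the per-batch
 * buffer list bplist[] (MAX_BUFS entries): pf_queue_io() kicks the I/O
 * workers once this many inode buffers are queued, and pf_batch_read()
 * treats dropping below it as the primary queue running low.
 */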
typedef enum pf_which {
        PF_PRIMARY,
        PF_SECONDARY,
        PF_META_ONLY,
} pf_which_t;
static void
pf_start_processing(
        prefetch_args_t         *args)
{
        if (!args->can_start_processing) {
                pftrace("signalling processing for AG %d", args->agno);

                args->can_start_processing = 1;
                pthread_cond_signal(&args->start_processing);
        }
}
static void
pf_start_io_workers(
        prefetch_args_t         *args)
{
        if (!args->can_start_reading) {
                pftrace("signalling reading for AG %d", args->agno);

                args->can_start_reading = 1;
                pthread_cond_broadcast(&args->start_reading);
        }
}
static void
pf_queue_io(
        prefetch_args_t         *args,
        struct xfs_buf_map      *map,
        int                     nmaps,
        int                     flag)
{
        xfs_buf_t               *bp;
        xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);

        /*
         * Never block on a buffer lock here, given that the actual repair
         * code might lock buffers in a different order from us.  Given that
         * the lock holder is either reading it from disk itself or
         * completely overwriting it, this behaviour is perfectly fine.
         */
        bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
        if (!bp)
                return;

        if (bp->b_flags & LIBXFS_B_UPTODATE) {
                if (B_IS_INODE(flag))
                        pf_read_inode_dirs(args, bp);
                XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
                                                CACHE_PREFETCH_PRIORITY);
                libxfs_putbuf(bp);
                return;
        }
        XFS_BUF_SET_PRIORITY(bp, flag);

        pthread_mutex_lock(&args->lock);

        btree_insert(args->io_queue, fsbno, bp);

        if (fsbno > args->last_bno_read) {
                if (B_IS_INODE(flag)) {
                        args->inode_bufs_queued++;
                        if (args->inode_bufs_queued == IO_THRESHOLD)
                                pf_start_io_workers(args);
                }
        } else {
                ASSERT(!B_IS_INODE(flag));
                XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
        }

        pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
                "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
                'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
                args->inode_bufs_queued, args->last_bno_read);

        pf_start_processing(args);

        pthread_mutex_unlock(&args->lock);
}
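/*
 * Note that io_queue is a btree keyed by fsbno, so buffers queued here in
 * any order come back out of btree_find()/btree_lookup_next() sorted by
 * disk address: e.g. clusters queued as fsbno 40, 8, 16 drain as 8, 16,
 * 40, which is what lets pf_batch_read() merge neighbours into a single
 * large sequential I/O.
 */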
static int
pf_read_bmbt_reclist(
        prefetch_args_t         *args,
        xfs_bmbt_rec_t          *rp,
        int                     numrecs)
{
        int                     i;
        xfs_bmbt_irec_t         irec;
        xfs_dfilblks_t          cp = 0; /* prev count */
        xfs_dfiloff_t           op = 0; /* prev offset */
#define MAP_ARRAY_SZ 4
        struct xfs_buf_map      map_array[MAP_ARRAY_SZ];
        struct xfs_buf_map      *map = map_array;
        int                     max_extents = MAP_ARRAY_SZ;
        int                     nmaps = 0;
        unsigned int            len = 0;
        int                     ret = 0;

        for (i = 0; i < numrecs; i++) {
                libxfs_bmbt_disk_get_all(rp + i, &irec);

                if (((i > 0) && (op + cp > irec.br_startoff)) ||
                                (irec.br_blockcount == 0) ||
                                (irec.br_startoff >= fs_max_file_offset))
                        goto out_free;

                if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
                                irec.br_startblock + irec.br_blockcount - 1))
                        goto out_free;

                if (!args->dirs_only && ((irec.br_startoff +
                                irec.br_blockcount) >= mp->m_dirfreeblk))
                        break;  /* only Phase 6 reads the free blocks */

                op = irec.br_startoff;
                cp = irec.br_blockcount;

                while (irec.br_blockcount) {
                        unsigned int    bm_len;

                        pftrace("queuing dir extent in AG %d", args->agno);

                        if (len + irec.br_blockcount >= mp->m_dirblkfsbs)
                                bm_len = mp->m_dirblkfsbs - len;
                        else
                                bm_len = irec.br_blockcount;
                        len += bm_len;

                        map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
                                                        irec.br_startblock);
                        map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
                        nmaps++;

                        if (len == mp->m_dirblkfsbs) {
                                pf_queue_io(args, map, nmaps, B_DIR_META);
                                len = 0;
                                nmaps = 0;
                        }

                        irec.br_blockcount -= bm_len;
                        irec.br_startblock += bm_len;

                        /*
                         * Handle very fragmented dir2 blocks with dynamically
                         * allocated buffer maps.
                         */
                        if (nmaps >= max_extents) {
                                struct xfs_buf_map      *old_map = NULL;

                                if (map == map_array) {
                                        old_map = map;
                                        map = NULL;
                                }
                                max_extents *= 2;
                                map = realloc(map, max_extents * sizeof(*map));
                                if (map == NULL)
                                        do_error(
                        _("couldn't malloc dir2 buffer list\n"));

                                if (old_map)
                                        memcpy(map, old_map, sizeof(map_array));
                        }
                }
        }
        ret = 1;
out_free:
        if (map != map_array)
                free(map);
        return ret;
}
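/*
 * Sketch of the chunking above, assuming 4 KiB fs blocks and 8 KiB
 * directory blocks (mp->m_dirblkfsbs == 2): a 5-fsb directory extent is
 * queued as two complete 2-fsb directory blocks via pf_queue_io(), while
 * the final 1-fsb map entry stays pending until a later extent supplies
 * the other half of that directory block - only whole directory blocks
 * are ever submitted for prefetch.
 */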
/*
 * simplified version of the main scan_lbtree. Returns 0 to stop.
 */
static int
pf_scan_lbtree(
        xfs_dfsbno_t            dbno,
        int                     level,
        int                     isadir,
        prefetch_args_t         *args,
        int                     (*func)(struct xfs_btree_block  *block,
                                        int                     level,
                                        int                     isadir,
                                        prefetch_args_t         *args))
{
        xfs_buf_t               *bp;
        int                     rc;

        bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
                        XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
        if (!bp)
                return 0;

        XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);

        rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);

        libxfs_putbuf(bp);

        return rc;
}
static int
pf_scanfunc_bmap(
        struct xfs_btree_block  *block,
        int                     level,
        int                     isadir,
        prefetch_args_t         *args)
{
        xfs_bmbt_ptr_t          *pp;
        int                     numrecs;
        int                     i;
        xfs_dfsbno_t            dbno;

        /*
         * do some validation on the block contents
         */
        if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
             block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
            (be16_to_cpu(block->bb_level) != level))
                return 0;

        numrecs = be16_to_cpu(block->bb_numrecs);

        if (level == 0) {
                if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
                        return 0;
                return pf_read_bmbt_reclist(args,
                        XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
        }

        if (numrecs > mp->m_bmap_dmxr[1])
                return 0;

        pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

        for (i = 0; i < numrecs; i++) {
                dbno = be64_to_cpu(pp[i]);
                if (!verify_dfsbno(mp, dbno))
                        return 0;
                if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
                        return 0;
        }
        return 1;
}
static void
pf_read_btinode(
        prefetch_args_t         *args,
        xfs_dinode_t            *dino,
        int                     isadir)
{
        xfs_bmdr_block_t        *dib;
        xfs_bmbt_ptr_t          *pp;
        int                     i;
        int                     level;
        int                     numrecs;
        int                     dsize;
        xfs_dfsbno_t            dbno;

        dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

        level = be16_to_cpu(dib->bb_level);
        numrecs = be16_to_cpu(dib->bb_numrecs);

        if ((numrecs == 0) || (level == 0) ||
                        (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
                return;
        /*
         * use bmdr/dfork_dsize since the root block is in the data fork
         */
        if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
                return;

        dsize = XFS_DFORK_DSIZE(dino, mp);
        pp = XFS_BMDR_PTR_ADDR(dib, 1, xfs_bmdr_maxrecs(mp, dsize, 0));

        for (i = 0; i < numrecs; i++) {
                dbno = be64_to_cpu(pp[i]);
                if (!verify_dfsbno(mp, dbno))
                        break;
                if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
                        break;
        }
}
static void
pf_read_exinode(
        prefetch_args_t         *args,
        xfs_dinode_t            *dino)
{
        pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
                        be32_to_cpu(dino->di_nextents));
}
static void
pf_read_inode_dirs(
        prefetch_args_t         *args,
        xfs_buf_t               *bp)
{
        xfs_dinode_t            *dino;
        int                     icnt = 0;
        int                     hasdir = 0;
        int                     isadir;

        libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
        if (bp->b_error)
                return;

        for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
                dino = xfs_make_iptr(mp, bp, icnt);

                /*
                 * We are only prefetching directory contents in extents
                 * and btree nodes for other inodes
                 */
                isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
                hasdir |= isadir;

                if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
                        continue;

                if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
                                args->dirs_only))
                        continue;

                /*
                 * do some checks on the inode to see if we can prefetch
                 * its directory data. It's a cut down version of
                 * process_dinode_int() in dinode.c.
                 */
                if (dino->di_format > XFS_DINODE_FMT_BTREE)
                        continue;

                if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
                        continue;

                if (!XFS_DINODE_GOOD_VERSION(dino->di_version))
                        continue;

                if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
                        continue;

                if ((dino->di_forkoff != 0) &&
                    (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
                        continue;

                switch (dino->di_format) {
                        case XFS_DINODE_FMT_EXTENTS:
                                pf_read_exinode(args, dino);
                                break;
                        case XFS_DINODE_FMT_BTREE:
                                pf_read_btinode(args, dino, isadir);
                                break;
                }
        }
        if (hasdir)
                XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}
/*
 * pf_batch_read must be called with the lock locked.
 */
static void
pf_batch_read(
        prefetch_args_t         *args,
        pf_which_t              which,
        void                    *buf)
{
        xfs_buf_t               *bplist[MAX_BUFS];
        unsigned int            num;
        off64_t                 first_off, last_off, next_off;
        int                     len, size;
        int                     i;
        int                     inode_bufs;
        unsigned long           fsbno = 0;
        unsigned long           max_fsbno;
        char                    *pbuf;

        for (;;) {
                num = 0;
                if (which == PF_SECONDARY) {
                        bplist[0] = btree_find(args->io_queue, 0, &fsbno);
                        max_fsbno = MIN(fsbno + pf_max_fsbs,
                                        args->last_bno_read);
                } else {
                        bplist[0] = btree_find(args->io_queue,
                                                args->last_bno_read, &fsbno);
                        max_fsbno = fsbno + pf_max_fsbs;
                }
                while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
                        /*
                         * Discontiguous buffers need special handling, so stop
                         * gathering new buffers and process the list and this
                         * discontiguous buffer immediately. This avoids the
                         * complexity of keeping a separate discontiguous buffer
                         * list and seeking back over ranges we've already done
                         * optimised reads for.
                         */
                        if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
                                num++;
                                break;
                        }
                        if (which != PF_META_ONLY ||
                            !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
                                num++;
                        if (num == MAX_BUFS)
                                break;
                        bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
                }
                if (!num)
                        return;
                /*
                 * do a big read if 25% of the potential buffer is useful,
                 * otherwise, find as many close together blocks and
                 * read them in one read
                 */
                first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
                last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
                        XFS_BUF_SIZE(bplist[num-1]);
                while (num > 1 && last_off - first_off > pf_max_bytes) {
                        num--;
                        last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
                                XFS_BUF_SIZE(bplist[num-1]);
                }
                if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
                        /*
                         * not enough blocks for one big read, so determine
                         * the number of blocks that are close enough.
                         */
                        last_off = first_off + XFS_BUF_SIZE(bplist[0]);
                        for (i = 1; i < num; i++) {
                                next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
                                                XFS_BUF_SIZE(bplist[i]);
                                if (next_off - last_off > pf_batch_bytes)
                                        break;
                                last_off = next_off;
                        }
                        num = i;
                }
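                /*
                 * Worked example of the density check above, assuming 4 KiB
                 * blocks (sb_blocklog == 12): for a trimmed span of 512 KiB,
                 * (last_off - first_off) >> (12 + 3) is 16, so at least 16
                 * queued buffers are required before the span is read with a
                 * single large I/O; with fewer, the list is cut at the first
                 * gap wider than pf_batch_bytes (64 KiB by default).
                 */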
                for (i = 0; i < num; i++) {
                        if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
                                        XFS_BUF_ADDR(bplist[i]))) == NULL)
                                do_error(_("prefetch corruption\n"));
                }

                if (which == PF_PRIMARY) {
                        for (inode_bufs = 0, i = 0; i < num; i++) {
                                if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
                                        inode_bufs++;
                        }
                        args->inode_bufs_queued -= inode_bufs;
                        if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
                                        pf_batch_fsbs)
                                args->last_bno_read = (first_off >>
                                                        mp->m_sb.sb_blocklog);
                }

                pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
                        (long long)XFS_BUF_ADDR(bplist[0]),
                        (long long)XFS_BUF_ADDR(bplist[num-1]), num,
                        (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
                        args->last_bno_read, args->inode_bufs_queued);

                pthread_mutex_unlock(&args->lock);
                /*
                 * now read the data and put into the xfs_buf_t's
                 */
                len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);

                /*
                 * Check the last buffer on the list to see if we need to
                 * process a discontiguous buffer. The gather loop above
                 * guarantees that only the last buffer in the list will be a
                 * discontiguous buffer.
                 */
                if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
                        libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
                        bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
                        libxfs_putbuf(bplist[num - 1]);
                        num--;
                }
                if (len > 0) {
                        /*
                         * go through the xfs_buf_t list copying from the
                         * read buffer into the xfs_buf_t's and release them.
                         */
                        for (i = 0; i < num; i++) {

                                pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
                                size = XFS_BUF_SIZE(bplist[i]);
                                if (len < size)
                                        break;  /* short read */
                                memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
                                bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
                                                        LIBXFS_B_UNCHECKED);
                                len -= size;
                                if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
                                        pf_read_inode_dirs(args, bplist[i]);
                                else if (which == PF_META_ONLY)
                                        XFS_BUF_SET_PRIORITY(bplist[i],
                                                                B_DIR_META_H);
                                else if (which == PF_PRIMARY && num == 1)
                                        XFS_BUF_SET_PRIORITY(bplist[i],
                                                                B_DIR_META_S);
                        }
                }
                for (i = 0; i < num; i++) {
                        pftrace("putbuf %c %p (%llu) in AG %d",
                                B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ?
                                                                'I' : 'M',
                                bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
                                args->agno);
                        libxfs_putbuf(bplist[i]);
                }
                pthread_mutex_lock(&args->lock);
                if (which != PF_SECONDARY) {
                        pftrace("inode_bufs_queued for AG %d = %d", args->agno,
                                args->inode_bufs_queued);
                        /*
                         * if the primary inode queue is running low, process
                         * metadata in both queues to avoid I/O starvation as
                         * the processing thread would be waiting for a
                         * metadata buffer
                         */
                        if (which == PF_PRIMARY && !args->queuing_done &&
                                        args->inode_bufs_queued < IO_THRESHOLD) {
                                pftrace("reading metadata bufs from primary queue for AG %d",
                                        args->agno);

                                pf_batch_read(args, PF_META_ONLY, buf);

                                pftrace("reading bufs from secondary queue for AG %d",
                                        args->agno);

                                pf_batch_read(args, PF_SECONDARY, buf);
                        }
                }
        }
}
static void *
pf_io_worker(
        void                    *param)
{
        prefetch_args_t         *args = param;
        void                    *buf = memalign(libxfs_device_alignment(),
                                                pf_max_bytes);

        if (buf == NULL)
                return NULL;

        pthread_mutex_lock(&args->lock);
        while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
                pftrace("waiting to start prefetch I/O for AG %d", args->agno);

                while (!args->can_start_reading && !args->queuing_done)
                        pthread_cond_wait(&args->start_reading, &args->lock);

                pftrace("starting prefetch I/O for AG %d", args->agno);

                pf_batch_read(args, PF_PRIMARY, buf);
                pf_batch_read(args, PF_SECONDARY, buf);

                pftrace("ran out of bufs to prefetch for AG %d", args->agno);

                if (!args->queuing_done)
                        args->can_start_reading = 0;
        }
        pthread_mutex_unlock(&args->lock);

        free(buf);

        pftrace("finished prefetch I/O for AG %d", args->agno);

        return NULL;
}
static int      pf_create_prefetch_thread(
                        prefetch_args_t *args);
static void *
pf_queuing_worker(
        void                    *param)
{
        prefetch_args_t         *args = param;
        int                     num_inos;
        ino_tree_node_t         *irec;
        ino_tree_node_t         *cur_irec;
        int                     blks_per_cluster;
        xfs_agblock_t           bno;
        int                     i;
        int                     err;

        blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
        if (blks_per_cluster == 0)
                blks_per_cluster = 1;

        for (i = 0; i < PF_THREAD_COUNT; i++) {
                err = pthread_create(&args->io_threads[i], NULL,
                                pf_io_worker, args);
                if (err != 0) {
                        do_warn(_("failed to create prefetch thread: %s\n"),
                                strerror(err));
                        if (i == 0) {
                                pf_start_processing(args);
                                return NULL;
                        }
                        /*
                         * since we have at least one I/O thread, use them for
                         * prefetch
                         */
                        break;
                }
        }
        pftrace("starting prefetch for AG %d", args->agno);

        for (irec = findfirst_inode_rec(args->agno); irec != NULL;
             irec = next_ino_rec(irec)) {

                cur_irec = irec;

                num_inos = XFS_INODES_PER_CHUNK;
                while (num_inos < XFS_IALLOC_INODES(mp) && irec != NULL) {
                        irec = next_ino_rec(irec);
                        num_inos += XFS_INODES_PER_CHUNK;
                }

                if (args->dirs_only && cur_irec->ino_isa_dir == 0)
                        continue;
#ifdef XR_PF_TRACE
                sem_getvalue(&args->ra_count, &i);
                pftrace("queuing irec %p in AG %d, sem count = %d",
                        irec, args->agno, i);
#endif
                err = sem_trywait(&args->ra_count);
                if (err < 0 && errno == EAGAIN) {
                        /*
                         * Kick the queue once we have reached the limit;
                         * without this the threads processing the inodes
                         * might get stuck on a buffer that has been locked
                         * and added to the I/O queue but is waiting for
                         * the thread to be woken.
                         */
                        pf_start_io_workers(args);
                        sem_wait(&args->ra_count);
                }
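                /*
                 * The ra_count semaphore above throttles queuing: it starts
                 * with max_queue slots (set up in start_inode_prefetch()) and
                 * one slot is consumed per inode chunk queued.  When no slot
                 * is free, the queuing thread first kicks the I/O workers so
                 * the buffers it has already queued can actually be read and
                 * consumed, then blocks in sem_wait() until the processing
                 * side releases a slot.
                 */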
                num_inos = 0;
                bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);

                do {
                        struct xfs_buf_map      map;

                        map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
                        map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
                        pf_queue_io(args, &map, 1,
                                    (cur_irec->ino_isa_dir != 0) ? B_DIR_INODE
                                                                 : B_INODE);
                        bno += blks_per_cluster;
                        num_inos += inodes_per_cluster;
                } while (num_inos < XFS_IALLOC_INODES(mp));
        }

        pthread_mutex_lock(&args->lock);

        pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
                args->agno, args->inode_bufs_queued);

        args->queuing_done = 1;
        pf_start_io_workers(args);
        pf_start_processing(args);
        pthread_mutex_unlock(&args->lock);

        /* now wait for the readers to finish */
        for (i = 0; i < PF_THREAD_COUNT; i++)
                if (args->io_threads[i])
                        pthread_join(args->io_threads[i], NULL);

        pftrace("prefetch for AG %d finished", args->agno);

        pthread_mutex_lock(&args->lock);

        ASSERT(btree_is_empty(args->io_queue));

        args->prefetch_done = 1;
        if (args->next_args)
                pf_create_prefetch_thread(args->next_args);

        pthread_mutex_unlock(&args->lock);

        return NULL;
}
static int
pf_create_prefetch_thread(
        prefetch_args_t         *args)
{
        int                     err;

        pftrace("creating queue thread for AG %d", args->agno);

        err = pthread_create(&args->queuing_thread, NULL,
                        pf_queuing_worker, args);
        if (err != 0) {
                do_warn(_("failed to create prefetch thread: %s\n"),
                        strerror(err));
                args->queuing_thread = 0;
                cleanup_inode_prefetch(args);
        }

        return err == 0;
}
void
init_prefetch(
        xfs_mount_t             *pmp)
{
        mp = pmp;
        mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
        pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
        pf_max_bbs = pf_max_bytes >> BBSHIFT;
        pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
        pf_batch_bytes = DEF_BATCH_BYTES;
        pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
}
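/*
 * Example values, assuming 4 KiB pages and 4 KiB filesystem blocks:
 * pf_max_bytes = 4096 << 7 = 512 KiB per prefetch I/O span, which is
 * pf_max_bbs = 1024 basic blocks and pf_max_fsbs = 128 fs blocks;
 * pf_batch_bytes = 64 KiB and pf_batch_fsbs = 65536 >> 13 = 8 fs blocks.
 */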
prefetch_args_t *
start_inode_prefetch(
        xfs_agnumber_t          agno,
        int                     dirs_only,
        prefetch_args_t         *prev_args)
{
        prefetch_args_t         *args;
        long                    max_queue;

        if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
                return NULL;

        args = calloc(1, sizeof(prefetch_args_t));

        btree_init(&args->io_queue);
        if (pthread_mutex_init(&args->lock, NULL) != 0)
                do_error(_("failed to initialize prefetch mutex\n"));
        if (pthread_cond_init(&args->start_reading, NULL) != 0)
                do_error(_("failed to initialize prefetch cond var\n"));
        if (pthread_cond_init(&args->start_processing, NULL) != 0)
                do_error(_("failed to initialize prefetch cond var\n"));
        args->agno = agno;
        args->dirs_only = dirs_only;

        /*
         * use only 1/8 of the libxfs cache as we are only counting inodes
         * and not any other associated metadata like directories
         */
        max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
        if (XFS_INODE_CLUSTER_SIZE(mp) > mp->m_sb.sb_blocksize)
                max_queue = max_queue * (XFS_INODE_CLUSTER_SIZE(mp) >>
                                mp->m_sb.sb_blocklog) / XFS_IALLOC_BLOCKS(mp);

        sem_init(&args->ra_count, 0, max_queue);
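        /*
         * Illustrative numbers for the sizing above: with a buffer cache of
         * c_maxcount = 65536 entries and thread_count = 4, max_queue is
         * 65536 / 4 / 8 = 2048 readahead slots.  The adjustment only applies
         * when an inode cluster spans more than one fs block, scaling the
         * slot count by the ratio of cluster blocks to inode-chunk blocks;
         * the real values depend on the cache size and thread count in use.
         */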
        if (!prev_args) {
                if (!pf_create_prefetch_thread(args))
                        return NULL;
        } else {
                pthread_mutex_lock(&prev_args->lock);
                if (prev_args->prefetch_done) {
                        if (!pf_create_prefetch_thread(args))
                                args = NULL;
                } else
                        prev_args->next_args = args;
                pthread_mutex_unlock(&prev_args->lock);
        }

        return args;
}
/*
 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
 * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
 * or process @end_ag). The function starts prefetch on the first AG, then loops
 * starting prefetch on the next AG and then blocks processing the current AG as
 * the prefetch queue brings inodes into the processing queue.
 *
 * There is only one prefetch taking place at a time, so the prefetch on the
 * next AG only starts once the current AG has been completely prefetched. Hence
 * the prefetch of the next AG will start some time before the processing of the
 * current AG finishes, ensuring that when we iterate and start processing the
 * next AG there is already a significant queue of inodes to process.
 *
 * Prefetch is done this way to prevent it from running too far ahead of the
 * processing. Allowing it to do so can cause cache thrashing, where new
 * prefetch causes previously prefetched buffers to be reclaimed before the
 * processing thread uses them. This results in reading all the inodes and
 * metadata twice per phase and it greatly slows down the processing. Hence we
 * have to carefully control how far ahead we prefetch...
 */
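/*
 * Concretely, with start_ag = 0 and end_ag = 3: prefetch starts on AG 0;
 * iteration 0 starts prefetch on AG 1 (chained behind AG 0) and processes
 * AG 0; iteration 1 starts prefetch on AG 2 and processes AG 1; iteration
 * 2 just processes AG 2.  The two-element pf_args[] array below ping-pongs
 * between the AG being prefetched and the AG being processed via the
 * (i & 1) / (~i & 1) index alternation.
 */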
void
prefetch_ag_range(
        struct work_queue       *work,
        xfs_agnumber_t          start_ag,
        xfs_agnumber_t          end_ag,
        bool                    dirs_only,
        void                    (*func)(struct work_queue *,
                                        xfs_agnumber_t, void *))
{
        int                     i;
        struct prefetch_args    *pf_args[2];

        pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
        for (i = start_ag; i < end_ag; i++) {
                /* Don't prefetch end_ag */
                if (i + 1 < end_ag)
                        pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
                                                dirs_only, pf_args[i & 1]);
                func(work, i, pf_args[i & 1]);
        }
}
struct pf_work_args {
        xfs_agnumber_t  start_ag;
        xfs_agnumber_t  end_ag;
        bool            dirs_only;
        void            (*func)(struct work_queue *, xfs_agnumber_t, void *);
};
static void
prefetch_ag_range_work(
        struct work_queue       *work,
        xfs_agnumber_t          unused,
        void                    *args)
{
        struct pf_work_args     *wargs = args;

        prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
                          wargs->dirs_only, wargs->func);
        free(args);
}
/*
 * Do inode prefetch in the most optimal way for the context under which repair
 * has been run.
 */
void
do_inode_prefetch(
        struct xfs_mount        *mp,
        int                     stride,
        void                    (*func)(struct work_queue *,
                                        xfs_agnumber_t, void *),
        bool                    check_cache,
        bool                    dirs_only)
{
        int                     i;
        struct work_queue       queue;
        struct work_queue       *queues;
        int                     queues_started = 0;
        /*
         * If the previous phases of repair have not overflowed the buffer
         * cache, then we don't need to re-read any of the metadata in the
         * filesystem - it's all in the cache. In that case, run a thread per
         * CPU to maximise parallelism of the queue to be processed.
         */
        if (check_cache && !libxfs_bcache_overflowed()) {
                queue.mp = mp;
                create_work_queue(&queue, mp, libxfs_nproc());
                for (i = 0; i < mp->m_sb.sb_agcount; i++)
                        queue_work(&queue, func, i, NULL);
                destroy_work_queue(&queue);
                return;
        }
        /*
         * single threaded behaviour - single prefetch thread, processed
         * directly after each AG is queued.
         */
        if (!stride) {
                queue.mp = mp;
                prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
                                dirs_only, func);
                return;
        }
        /*
         * create one worker thread for each segment of the volume
         */
        queues = malloc(thread_count * sizeof(work_queue_t));
        for (i = 0; i < thread_count; i++) {
                struct pf_work_args *wargs;

                wargs = malloc(sizeof(struct pf_work_args));
                wargs->start_ag = i * stride;
                wargs->end_ag = min((i + 1) * stride,
                                    mp->m_sb.sb_agcount);
                wargs->dirs_only = dirs_only;
                wargs->func = func;

                create_work_queue(&queues[i], mp, 1);
                queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
                queues_started++;

                if (wargs->end_ag >= mp->m_sb.sb_agcount)
                        break;
        }
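        /*
         * Example of the partitioning above: with sb_agcount = 16,
         * thread_count = 4 and stride = 4, worker 0 covers AGs 0-3, worker 1
         * AGs 4-7, worker 2 AGs 8-11 and worker 3 AGs 12-15; the early break
         * stops creating further queues once the last AG has been covered.
         */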
        /*
         * wait for workers to complete
         */
        for (i = 0; i < queues_started; i++)
                destroy_work_queue(&queues[i]);
        free(queues);
}
void
wait_for_inode_prefetch(
        prefetch_args_t         *args)
{
        if (args == NULL)
                return;

        pthread_mutex_lock(&args->lock);

        while (!args->can_start_processing) {
                pftrace("waiting to start processing AG %d", args->agno);

                pthread_cond_wait(&args->start_processing, &args->lock);
        }
        pftrace("can start processing AG %d", args->agno);

        pthread_mutex_unlock(&args->lock);
}
void
cleanup_inode_prefetch(
        prefetch_args_t         *args)
{
        if (args == NULL)
                return;

        pftrace("waiting AG %d prefetch to finish", args->agno);

        if (args->queuing_thread)
                pthread_join(args->queuing_thread, NULL);

        pftrace("AG %d prefetch done", args->agno);

        pthread_mutex_destroy(&args->lock);
        pthread_cond_destroy(&args->start_reading);
        pthread_cond_destroy(&args->start_processing);
        sem_destroy(&args->ra_count);
        btree_destroy(args->io_queue);

        free(args);
}
#ifdef XR_PF_TRACE

static FILE     *pf_trace_file;

void
pftrace_init(void)
{
        pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
        setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

void
pftrace_done(void)
{
        fclose(pf_trace_file);
}
void
_pftrace(const char *func, const char *msg, ...)
{
        char            buf[200];
        struct timeval  tv;
        va_list         args;

        gettimeofday(&tv, NULL);

        va_start(args, msg);
        vsnprintf(buf, sizeof(buf), msg, args);
        buf[sizeof(buf)-1] = '\0';
        va_end(args);

        fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec,
                func, buf);
}

#endif