]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - repair/prefetch.c
b4f20d948ea86549fa751e4b65a8319c5984cbea
[thirdparty/xfsprogs-dev.git] / repair / prefetch.c
1 #include "libxfs.h"
2 #include <pthread.h>
3 #include "avl.h"
4 #include "btree.h"
5 #include "globals.h"
6 #include "agheader.h"
7 #include "incore.h"
8 #include "dir2.h"
9 #include "protos.h"
10 #include "err_protos.h"
11 #include "dinode.h"
12 #include "bmap.h"
13 #include "versions.h"
14 #include "threads.h"
15 #include "prefetch.h"
16 #include "progress.h"
17
/* Non-zero enables prefetching; start_inode_prefetch() returns NULL when 0. */
int do_prefetch = 1;

/*
 * Performs prefetching by priming the libxfs cache by using a dedicated thread
 * scanning inodes and reading blocks in ahead of the time they are required.
 *
 * Any I/O errors can be safely ignored.
 */

/* Mount being repaired and the fd of its data device; set by init_prefetch(). */
static xfs_mount_t	*mp;
static int		mp_fd;
/*
 * Size limits for one batched read, all derived in init_prefetch():
 * the maximum span in bytes, in basic blocks and in filesystem blocks.
 */
static int		pf_max_bytes;
static int		pf_max_bbs;
static int		pf_max_fsbs;
/*
 * pf_batch_bytes bounds how far apart consecutive buffers may be when
 * pf_batch_read() falls back to a smaller read; pf_batch_fsbs is the block
 * threshold pf_batch_read() uses before advancing last_bno_read.
 */
static int		pf_batch_bytes;
static int		pf_batch_fsbs;

/* Defined below; needed early because pf_queue_io() calls it. */
static void		pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
36
/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 *
 * Every expansion is parenthesised so that the macros evaluate as a single
 * value when they are used inside a larger expression.
 */

/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	(CACHE_PREFETCH_PRIORITY + 7)
/* directory metadata in secondary queue */
#define B_DIR_META_2	(CACHE_PREFETCH_PRIORITY + 6)
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	(CACHE_PREFETCH_PRIORITY + 5)
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	(CACHE_PREFETCH_PRIORITY + 4)
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	(CACHE_PREFETCH_PRIORITY + 3)
/* inode clusters with directory inodes */
#define B_DIR_INODE	(CACHE_PREFETCH_PRIORITY + 2)
/* intermediate extent btree nodes */
#define B_BMAP		(CACHE_PREFETCH_PRIORITY + 1)
/* inode clusters without any directory entries */
#define B_INODE		(CACHE_PREFETCH_PRIORITY)

/*
 * Test bits 0 and 2 of the "priority tag" of the buffer: if neither is set
 * the buffer holds inodes (B_INODE or B_DIR_INODE), otherwise it holds other
 * metadata.  This assumes CACHE_PREFETCH_PRIORITY itself has bits 0 and 2
 * clear - TODO confirm against its definition.
 */
#define B_IS_INODE(f)	(((f) & 5) == 0)

/* default distance limit, in bytes, for clustering nearby buffers */
#define DEF_BATCH_BYTES	0x10000

/* maximum number of buffers gathered for one batched read */
#define MAX_BUFS	128

/* number of queued inode buffers at which the I/O workers are kicked */
#define IO_THRESHOLD	(MAX_BUFS * 2)
72
/*
 * Selects which part of the I/O queue pf_batch_read() services.
 */
typedef enum pf_which {
	/* buffers starting at last_bno_read and above */
	PF_PRIMARY,
	/* buffers from the start of the queue up to last_bno_read */
	PF_SECONDARY,
	/* same range as PF_PRIMARY, but inode buffers are skipped */
	PF_META_ONLY
} pf_which_t;
78
79
80 static inline void
81 pf_start_processing(
82 prefetch_args_t *args)
83 {
84 if (!args->can_start_processing) {
85 pftrace("signalling processing for AG %d", args->agno);
86
87 args->can_start_processing = 1;
88 pthread_cond_signal(&args->start_processing);
89 }
90 }
91
92 static inline void
93 pf_start_io_workers(
94 prefetch_args_t *args)
95 {
96 if (!args->can_start_reading) {
97 pftrace("signalling reading for AG %d", args->agno);
98
99 args->can_start_reading = 1;
100 pthread_cond_broadcast(&args->start_reading);
101 }
102 }
103
104
/*
 * Get the buffer described by @map without blocking on its lock and hand it
 * to the prefetch machinery.
 *
 * If the buffer is already up to date it is processed immediately (inode
 * buffers are scanned for directories), its priority is raised by
 * CACHE_PREFETCH_PRIORITY and it is released.  Otherwise it is tagged with
 * @flag and inserted into args->io_queue, keyed by the filesystem block of
 * the first map entry, for the I/O workers to read.
 *
 * @flag is one of the B_* priorities; B_IS_INODE(flag) tells inode buffers
 * from other metadata.
 */
static void
pf_queue_io(
	prefetch_args_t		*args,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flag)
{
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);

	/*
	 * Never block on a buffer lock here, given that the actual repair
	 * code might lock buffers in a different order from us.  Given that
	 * the lock holder is either reading it from disk himself or
	 * completely overwriting it this behaviour is perfectly fine.
	 */
	bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
	if (!bp)
		return;

	/* Already read in: no I/O to queue, just bump its cache priority. */
	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
						CACHE_PREFETCH_PRIORITY);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	btree_insert(args->io_queue, fsbno, bp);

	if (fsbno > args->last_bno_read) {
		/*
		 * Ahead of the read position: count inode buffers and kick
		 * the I/O workers when exactly IO_THRESHOLD are outstanding.
		 */
		if (B_IS_INODE(flag)) {
			args->inode_bufs_queued++;
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
	} else {
		/*
		 * At or behind the read position: only non-inode metadata is
		 * expected here, and it is retagged as secondary-queue
		 * directory metadata.
		 */
		ASSERT(!B_IS_INODE(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
	}

	pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
		"(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
		'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
		args->inode_bufs_queued, args->last_bno_read);

	/* Something is queued, so the processing side may start. */
	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}
159
/*
 * Walk @numrecs on-disk extent records starting at @rp and queue prefetch
 * I/O for the blocks they map, one directory block (m_dir_geo->fsbcount
 * filesystem blocks) at a time.  A directory block that spans several
 * extents is queued as a single multi-map buffer.
 *
 * Returns 1 if the whole list was consumed (or the walk stopped early at the
 * free-block region), 0 if a record failed validation.
 */
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_filblks_t		cp = 0;		/* prev count */
	xfs_fileoff_t		op = 0;		/* prev offset */
#define MAP_ARRAY_SZ 4
	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
	struct xfs_buf_map	*map = map_array;	/* on-stack until it overflows */
	int			max_extents = MAP_ARRAY_SZ;
	int			nmaps = 0;	/* maps gathered for the current dir block */
	unsigned int		len = 0;	/* fs blocks gathered for the current dir block */
	int			ret = 0;


	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		/*
		 * Reject extents that overlap the previous one, are empty or
		 * start beyond the maximum file offset.
		 */
		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			goto out_free;

		/* Both ends of the extent must be valid filesystem blocks. */
		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			goto out_free;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dir_geo->freeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		/* Carve the extent into pieces that complete directory blocks. */
		while (irec.br_blockcount) {
			unsigned int	bm_len;

			pftrace("queuing dir extent in AG %d", args->agno);

			/* Take only as much as is needed to finish this dir block. */
			if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
				bm_len = mp->m_dir_geo->fsbcount - len;
			else
				bm_len = irec.br_blockcount;
			len += bm_len;

			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
						irec.br_startblock);
			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
			nmaps++;

			/* A full directory block has been gathered: queue it. */
			if (len == mp->m_dir_geo->fsbcount) {
				pf_queue_io(args, map, nmaps, B_DIR_META);
				len = 0;
				nmaps = 0;
			}

			irec.br_blockcount -= bm_len;
			irec.br_startblock += bm_len;

			/*
			 * Handle very fragmented dir2 blocks with dynamically
			 * allocated buffer maps.
			 */
			if (nmaps >= max_extents) {
				struct xfs_buf_map *old_map = NULL;

				/*
				 * First overflow: the on-stack array cannot be
				 * passed to realloc(), so allocate fresh and
				 * copy its contents over below.
				 */
				if (map == map_array) {
					old_map = map;
					map = NULL;
				}
				max_extents *= 2;
				map = realloc(map, max_extents * sizeof(*map));
				if (map == NULL) {
					do_error(
			_("couldn't malloc dir2 buffer list\n"));
					exit(1);
				}
				if (old_map)
					memcpy(map, old_map, sizeof(map_array));
			}

		}
	}
	ret = 1;
out_free:
	/* Only free the map if it was moved to the heap. */
	if (map != map_array)
		free(map);
	return ret;
}
253
254 /*
255 * simplified version of the main scan_lbtree. Returns 0 to stop.
256 */
257
258 static int
259 pf_scan_lbtree(
260 xfs_fsblock_t dbno,
261 int level,
262 int isadir,
263 prefetch_args_t *args,
264 int (*func)(struct xfs_btree_block *block,
265 int level,
266 int isadir,
267 prefetch_args_t *args))
268 {
269 xfs_buf_t *bp;
270 int rc;
271
272 bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
273 XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
274 if (!bp)
275 return 0;
276
277 XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
278
279 /*
280 * If the verifier flagged a problem with the buffer, we can't trust
281 * its contents for the purposes of reading ahead. Stop prefetching
282 * the tree and mark the buffer unchecked so that the next read of the
283 * buffer will retain the error status and be acted upon appropriately.
284 */
285 if (bp->b_error) {
286 bp->b_flags |= LIBXFS_B_UNCHECKED;
287 libxfs_putbuf(bp);
288 return 0;
289 }
290
291 rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
292
293 libxfs_putbuf(bp);
294
295 return rc;
296 }
297
298 static int
299 pf_scanfunc_bmap(
300 struct xfs_btree_block *block,
301 int level,
302 int isadir,
303 prefetch_args_t *args)
304 {
305 xfs_bmbt_ptr_t *pp;
306 int numrecs;
307 int i;
308 xfs_fsblock_t dbno;
309
310 /*
311 * do some validation on the block contents
312 */
313 if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
314 block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
315 (be16_to_cpu(block->bb_level) != level))
316 return 0;
317
318 numrecs = be16_to_cpu(block->bb_numrecs);
319
320 if (level == 0) {
321 if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
322 return 0;
323 return pf_read_bmbt_reclist(args,
324 XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
325 }
326
327 if (numrecs > mp->m_bmap_dmxr[1])
328 return 0;
329
330 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
331
332 for (i = 0; i < numrecs; i++) {
333 dbno = get_unaligned_be64(&pp[i]);
334 if (!verify_dfsbno(mp, dbno))
335 return 0;
336 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
337 return 0;
338 }
339 return 1;
340 }
341
342
343 static void
344 pf_read_btinode(
345 prefetch_args_t *args,
346 xfs_dinode_t *dino,
347 int isadir)
348 {
349 xfs_bmdr_block_t *dib;
350 xfs_bmbt_ptr_t *pp;
351 int i;
352 int level;
353 int numrecs;
354 int dsize;
355 xfs_fsblock_t dbno;
356
357 dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
358
359 level = be16_to_cpu(dib->bb_level);
360 numrecs = be16_to_cpu(dib->bb_numrecs);
361
362 if ((numrecs == 0) || (level == 0) ||
363 (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
364 return;
365 /*
366 * use bmdr/dfork_dsize since the root block is in the data fork
367 */
368 if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
369 return;
370
371 dsize = XFS_DFORK_DSIZE(dino, mp);
372 pp = XFS_BMDR_PTR_ADDR(dib, 1, xfs_bmdr_maxrecs(dsize, 0));
373
374 for (i = 0; i < numrecs; i++) {
375 dbno = get_unaligned_be64(&pp[i]);
376 if (!verify_dfsbno(mp, dbno))
377 break;
378 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
379 break;
380 }
381 }
382
383 static void
384 pf_read_exinode(
385 prefetch_args_t *args,
386 xfs_dinode_t *dino)
387 {
388 pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
389 be32_to_cpu(dino->di_nextents));
390 }
391
392 static void
393 pf_read_inode_dirs(
394 prefetch_args_t *args,
395 xfs_buf_t *bp)
396 {
397 xfs_dinode_t *dino;
398 int icnt = 0;
399 int hasdir = 0;
400 int isadir;
401
402 libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
403 if (bp->b_error)
404 return;
405
406 for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
407 dino = xfs_make_iptr(mp, bp, icnt);
408
409 /*
410 * We are only prefetching directory contents in extents
411 * and btree nodes for other inodes
412 */
413 isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
414 hasdir |= isadir;
415
416 if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
417 continue;
418
419 if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
420 args->dirs_only))
421 continue;
422
423 /*
424 * do some checks on the inode to see if we can prefetch
425 * its directory data. It's a cut down version of
426 * process_dinode_int() in dinode.c.
427 */
428 if (dino->di_format > XFS_DINODE_FMT_BTREE)
429 continue;
430
431 if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
432 continue;
433
434 if (!xfs_dinode_good_version(mp, dino->di_version))
435 continue;
436
437 if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
438 continue;
439
440 if ((dino->di_forkoff != 0) &&
441 (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
442 continue;
443
444 switch (dino->di_format) {
445 case XFS_DINODE_FMT_EXTENTS:
446 pf_read_exinode(args, dino);
447 break;
448 case XFS_DINODE_FMT_BTREE:
449 pf_read_btinode(args, dino, isadir);
450 break;
451 }
452 }
453 if (hasdir)
454 XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
455 }
456
457 /*
458 * pf_batch_read must be called with the lock locked.
459 */
460 static void
461 pf_batch_read(
462 prefetch_args_t *args,
463 pf_which_t which,
464 void *buf)
465 {
466 xfs_buf_t *bplist[MAX_BUFS];
467 unsigned int num;
468 off64_t first_off, last_off, next_off;
469 int len, size;
470 int i;
471 int inode_bufs;
472 unsigned long fsbno = 0;
473 unsigned long max_fsbno;
474 char *pbuf;
475
476 for (;;) {
477 num = 0;
478 if (which == PF_SECONDARY) {
479 bplist[0] = btree_find(args->io_queue, 0, &fsbno);
480 max_fsbno = MIN(fsbno + pf_max_fsbs,
481 args->last_bno_read);
482 } else {
483 bplist[0] = btree_find(args->io_queue,
484 args->last_bno_read, &fsbno);
485 max_fsbno = fsbno + pf_max_fsbs;
486 }
487 while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
488 /*
489 * Discontiguous buffers need special handling, so stop
490 * gathering new buffers and process the list and this
491 * discontigous buffer immediately. This avoids the
492 * complexity of keeping a separate discontigous buffer
493 * list and seeking back over ranges we've already done
494 * optimised reads for.
495 */
496 if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
497 num++;
498 break;
499 }
500
501 if (which != PF_META_ONLY ||
502 !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
503 num++;
504 if (num == MAX_BUFS)
505 break;
506 bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
507 }
508 if (!num)
509 return;
510
511 /*
512 * do a big read if 25% of the potential buffer is useful,
513 * otherwise, find as many close together blocks and
514 * read them in one read
515 */
516 first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
517 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
518 XFS_BUF_SIZE(bplist[num-1]);
519 while (num > 1 && last_off - first_off > pf_max_bytes) {
520 num--;
521 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
522 XFS_BUF_SIZE(bplist[num-1]);
523 }
524 if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
525 /*
526 * not enough blocks for one big read, so determine
527 * the number of blocks that are close enough.
528 */
529 last_off = first_off + XFS_BUF_SIZE(bplist[0]);
530 for (i = 1; i < num; i++) {
531 next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
532 XFS_BUF_SIZE(bplist[i]);
533 if (next_off - last_off > pf_batch_bytes)
534 break;
535 last_off = next_off;
536 }
537 num = i;
538 }
539
540 for (i = 0; i < num; i++) {
541 if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
542 XFS_BUF_ADDR(bplist[i]))) == NULL)
543 do_error(_("prefetch corruption\n"));
544 }
545
546 if (which == PF_PRIMARY) {
547 for (inode_bufs = 0, i = 0; i < num; i++) {
548 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
549 inode_bufs++;
550 }
551 args->inode_bufs_queued -= inode_bufs;
552 if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
553 pf_batch_fsbs)
554 args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
555 }
556 #ifdef XR_PF_TRACE
557 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
558 (long long)XFS_BUF_ADDR(bplist[0]),
559 (long long)XFS_BUF_ADDR(bplist[num-1]), num,
560 (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
561 args->last_bno_read, args->inode_bufs_queued);
562 #endif
563 pthread_mutex_unlock(&args->lock);
564
565 /*
566 * now read the data and put into the xfs_but_t's
567 */
568 len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);
569
570 /*
571 * Check the last buffer on the list to see if we need to
572 * process a discontiguous buffer. The gather above loop
573 * guarantees that only the last buffer in the list will be a
574 * discontiguous buffer.
575 */
576 if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
577 libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
578 bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
579 libxfs_putbuf(bplist[num - 1]);
580 num--;
581 }
582
583 if (len > 0) {
584 /*
585 * go through the xfs_buf_t list copying from the
586 * read buffer into the xfs_buf_t's and release them.
587 */
588 for (i = 0; i < num; i++) {
589
590 pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
591 size = XFS_BUF_SIZE(bplist[i]);
592 if (len < size)
593 break;
594 memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
595 bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
596 LIBXFS_B_UNCHECKED);
597 len -= size;
598 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
599 pf_read_inode_dirs(args, bplist[i]);
600 else if (which == PF_META_ONLY)
601 XFS_BUF_SET_PRIORITY(bplist[i],
602 B_DIR_META_H);
603 else if (which == PF_PRIMARY && num == 1)
604 XFS_BUF_SET_PRIORITY(bplist[i],
605 B_DIR_META_S);
606 }
607 }
608 for (i = 0; i < num; i++) {
609 pftrace("putbuf %c %p (%llu) in AG %d",
610 B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
611 bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
612 args->agno);
613 libxfs_putbuf(bplist[i]);
614 }
615 pthread_mutex_lock(&args->lock);
616 if (which != PF_SECONDARY) {
617 pftrace("inode_bufs_queued for AG %d = %d", args->agno,
618 args->inode_bufs_queued);
619 /*
620 * if primary inode queue running low, process metadata
621 * in boths queues to avoid I/O starvation as the
622 * processing thread would be waiting for a metadata
623 * buffer
624 */
625 if (which == PF_PRIMARY && !args->queuing_done &&
626 args->inode_bufs_queued < IO_THRESHOLD) {
627 pftrace("reading metadata bufs from primary queue for AG %d",
628 args->agno);
629
630 pf_batch_read(args, PF_META_ONLY, buf);
631
632 pftrace("reading bufs from secondary queue for AG %d",
633 args->agno);
634
635 pf_batch_read(args, PF_SECONDARY, buf);
636 }
637 }
638 }
639 }
640
/*
 * I/O worker thread body.  @param is the prefetch_args_t of the AG being
 * prefetched.
 *
 * The worker allocates one scratch buffer of pf_max_bytes bytes and then,
 * with args->lock held, repeatedly waits for permission to read and drains
 * the primary and secondary parts of the I/O queue.  It exits once queuing
 * is done and the queue is empty.  Always returns NULL.
 *
 * NOTE(review): if the scratch buffer cannot be allocated the thread exits
 * silently without servicing the queue - confirm that is acceptable when
 * every worker fails.
 */
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	void			*buf = memalign(libxfs_device_alignment(),
						pf_max_bytes);

	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);

		/* sleep until kicked, unless queuing has already finished */
		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);

		pftrace("starting prefetch I/O for AG %d", args->agno);

		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

		pftrace("ran out of bufs to prefetch for AG %d", args->agno);

		/* queue drained: go back to waiting for the next kick */
		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

	pftrace("finished prefetch I/O for AG %d", args->agno);

	return NULL;
}
677
678 static int
679 pf_create_prefetch_thread(
680 prefetch_args_t *args);
681
682 static void *
683 pf_queuing_worker(
684 void *param)
685 {
686 prefetch_args_t *args = param;
687 int num_inos;
688 ino_tree_node_t *irec;
689 ino_tree_node_t *cur_irec;
690 int blks_per_cluster;
691 xfs_agblock_t bno;
692 int i;
693 int err;
694 uint64_t sparse;
695
696 blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
697 if (blks_per_cluster == 0)
698 blks_per_cluster = 1;
699
700 for (i = 0; i < PF_THREAD_COUNT; i++) {
701 err = pthread_create(&args->io_threads[i], NULL,
702 pf_io_worker, args);
703 if (err != 0) {
704 do_warn(_("failed to create prefetch thread: %s\n"),
705 strerror(err));
706 if (i == 0) {
707 pf_start_processing(args);
708 return NULL;
709 }
710 /*
711 * since we have at least one I/O thread, use them for
712 * prefetch
713 */
714 break;
715 }
716 }
717 pftrace("starting prefetch for AG %d", args->agno);
718
719 for (irec = findfirst_inode_rec(args->agno); irec != NULL;
720 irec = next_ino_rec(irec)) {
721
722 cur_irec = irec;
723
724 num_inos = XFS_INODES_PER_CHUNK;
725 while (num_inos < mp->m_ialloc_inos && irec != NULL) {
726 irec = next_ino_rec(irec);
727 num_inos += XFS_INODES_PER_CHUNK;
728 }
729
730 if (args->dirs_only && cur_irec->ino_isa_dir == 0)
731 continue;
732 #ifdef XR_PF_TRACE
733 sem_getvalue(&args->ra_count, &i);
734 pftrace("queuing irec %p in AG %d, sem count = %d",
735 irec, args->agno, i);
736 #endif
737 err = sem_trywait(&args->ra_count);
738 if (err < 0 && errno == EAGAIN) {
739 /*
740 * Kick the queue once we have reached the limit;
741 * without this the threads processing the inodes
742 * might get stuck on a buffer that has been locked
743 * and added to the I/O queue but is waiting for
744 * the thread to be woken.
745 */
746 pf_start_io_workers(args);
747 sem_wait(&args->ra_count);
748 }
749
750 num_inos = 0;
751 bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
752 sparse = cur_irec->ir_sparse;
753
754 do {
755 struct xfs_buf_map map;
756
757 map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
758 map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
759
760 /*
761 * Queue I/O for each non-sparse cluster. We can check
762 * sparse state in cluster sized chunks as cluster size
763 * is the min. granularity of sparse irec regions.
764 */
765 if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
766 pf_queue_io(args, &map, 1,
767 (cur_irec->ino_isa_dir != 0) ?
768 B_DIR_INODE : B_INODE);
769
770 bno += blks_per_cluster;
771 num_inos += inodes_per_cluster;
772 sparse >>= inodes_per_cluster;
773 } while (num_inos < mp->m_ialloc_inos);
774 }
775
776 pthread_mutex_lock(&args->lock);
777
778 pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
779 args->agno, args->inode_bufs_queued);
780
781 args->queuing_done = 1;
782 pf_start_io_workers(args);
783 pf_start_processing(args);
784 pthread_mutex_unlock(&args->lock);
785
786 /* now wait for the readers to finish */
787 for (i = 0; i < PF_THREAD_COUNT; i++)
788 if (args->io_threads[i])
789 pthread_join(args->io_threads[i], NULL);
790
791 pftrace("prefetch for AG %d finished", args->agno);
792
793 pthread_mutex_lock(&args->lock);
794
795 ASSERT(btree_is_empty(args->io_queue));
796
797 args->prefetch_done = 1;
798 if (args->next_args)
799 pf_create_prefetch_thread(args->next_args);
800
801 pthread_mutex_unlock(&args->lock);
802
803 return NULL;
804 }
805
806 static int
807 pf_create_prefetch_thread(
808 prefetch_args_t *args)
809 {
810 int err;
811
812 pftrace("creating queue thread for AG %d", args->agno);
813
814 err = pthread_create(&args->queuing_thread, NULL,
815 pf_queuing_worker, args);
816 if (err != 0) {
817 do_warn(_("failed to create prefetch thread: %s\n"),
818 strerror(err));
819 cleanup_inode_prefetch(args);
820 }
821
822 return err == 0;
823 }
824
825 void
826 init_prefetch(
827 xfs_mount_t *pmp)
828 {
829 mp = pmp;
830 mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
831 pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
832 pf_max_bbs = pf_max_bytes >> BBSHIFT;
833 pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
834 pf_batch_bytes = DEF_BATCH_BYTES;
835 pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
836 }
837
838 prefetch_args_t *
839 start_inode_prefetch(
840 xfs_agnumber_t agno,
841 int dirs_only,
842 prefetch_args_t *prev_args)
843 {
844 prefetch_args_t *args;
845 long max_queue;
846
847 if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
848 return NULL;
849
850 args = calloc(1, sizeof(prefetch_args_t));
851
852 btree_init(&args->io_queue);
853 if (pthread_mutex_init(&args->lock, NULL) != 0)
854 do_error(_("failed to initialize prefetch mutex\n"));
855 if (pthread_cond_init(&args->start_reading, NULL) != 0)
856 do_error(_("failed to initialize prefetch cond var\n"));
857 if (pthread_cond_init(&args->start_processing, NULL) != 0)
858 do_error(_("failed to initialize prefetch cond var\n"));
859 args->agno = agno;
860 args->dirs_only = dirs_only;
861
862 /*
863 * use only 1/8 of the libxfs cache as we are only counting inodes
864 * and not any other associated metadata like directories
865 */
866
867 max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
868 if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
869 max_queue = max_queue *
870 (mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
871 mp->m_ialloc_blks;
872
873 sem_init(&args->ra_count, 0, max_queue);
874
875 if (!prev_args) {
876 if (!pf_create_prefetch_thread(args))
877 return NULL;
878 } else {
879 pthread_mutex_lock(&prev_args->lock);
880 if (prev_args->prefetch_done) {
881 if (!pf_create_prefetch_thread(args))
882 args = NULL;
883 } else
884 prev_args->next_args = args;
885 pthread_mutex_unlock(&prev_args->lock);
886 }
887
888 return args;
889 }
890
891 /*
892 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
893 * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch
894 * or process @end_ag). The function starts prefetch on the first AG, then loops
895 * starting prefetch on the next AG and then blocks processing the current AG as
896 * the prefetch queue brings inodes into the processing queue.
897 *
898 * There is only one prefetch taking place at a time, so the prefetch on the
899 * next AG only starts once the current AG has been completely prefetched. Hence
900 * the prefetch of the next AG will start some time before the processing of the
901 * current AG finishes, ensuring that when we iterate an start processing the
902 * next AG there is already a significant queue of inodes to process.
903 *
904 * Prefetch is done this way to prevent it from running too far ahead of the
905 * processing. Allowing it to do so can cause cache thrashing, where new
906 * prefetch causes previously prefetched buffers to be reclaimed before the
907 * processing thread uses them. This results in reading all the inodes and
908 * metadata twice per phase and it greatly slows down the processing. Hence we
909 * have to carefully control how far ahead we prefetch...
910 */
911 static void
912 prefetch_ag_range(
913 struct work_queue *work,
914 xfs_agnumber_t start_ag,
915 xfs_agnumber_t end_ag,
916 bool dirs_only,
917 void (*func)(struct work_queue *,
918 xfs_agnumber_t, void *))
919 {
920 int i;
921 struct prefetch_args *pf_args[2];
922
923 pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
924 for (i = start_ag; i < end_ag; i++) {
925 /* Don't prefetch end_ag */
926 if (i + 1 < end_ag)
927 pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
928 dirs_only, pf_args[i & 1]);
929 func(work, i, pf_args[i & 1]);
930 }
931 }
932
/*
 * Arguments for one prefetch_ag_range() call, packaged so they can be passed
 * through a work queue to prefetch_ag_range_work().
 */
struct pf_work_args {
	xfs_agnumber_t	start_ag;	/* first AG to prefetch and process */
	xfs_agnumber_t	end_ag;		/* one past the last AG */
	bool		dirs_only;	/* passed through to start_inode_prefetch() */
	/* per-AG processing function */
	void		(*func)(struct work_queue *, xfs_agnumber_t, void *);
};
939
940 static void
941 prefetch_ag_range_work(
942 struct work_queue *work,
943 xfs_agnumber_t unused,
944 void *args)
945 {
946 struct pf_work_args *wargs = args;
947
948 prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
949 wargs->dirs_only, wargs->func);
950 free(args);
951 }
952
953 /*
954 * Do inode prefetch in the most optimal way for the context under which repair
955 * has been run.
956 */
957 void
958 do_inode_prefetch(
959 struct xfs_mount *mp,
960 int stride,
961 void (*func)(struct work_queue *,
962 xfs_agnumber_t, void *),
963 bool check_cache,
964 bool dirs_only)
965 {
966 int i;
967 struct work_queue queue;
968 struct work_queue *queues;
969 int queues_started = 0;
970
971 /*
972 * If the previous phases of repair have not overflowed the buffer
973 * cache, then we don't need to re-read any of the metadata in the
974 * filesystem - it's all in the cache. In that case, run a thread per
975 * CPU to maximise parallelism of the queue to be processed.
976 */
977 if (check_cache && !libxfs_bcache_overflowed()) {
978 queue.mp = mp;
979 create_work_queue(&queue, mp, libxfs_nproc());
980 for (i = 0; i < mp->m_sb.sb_agcount; i++)
981 queue_work(&queue, func, i, NULL);
982 destroy_work_queue(&queue);
983 return;
984 }
985
986 /*
987 * single threaded behaviour - single prefetch thread, processed
988 * directly after each AG is queued.
989 */
990 if (!stride) {
991 queue.mp = mp;
992 prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
993 dirs_only, func);
994 return;
995 }
996
997 /*
998 * create one worker thread for each segment of the volume
999 */
1000 queues = malloc(thread_count * sizeof(work_queue_t));
1001 for (i = 0; i < thread_count; i++) {
1002 struct pf_work_args *wargs;
1003
1004 wargs = malloc(sizeof(struct pf_work_args));
1005 wargs->start_ag = i * stride;
1006 wargs->end_ag = min((i + 1) * stride,
1007 mp->m_sb.sb_agcount);
1008 wargs->dirs_only = dirs_only;
1009 wargs->func = func;
1010
1011 create_work_queue(&queues[i], mp, 1);
1012 queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
1013 queues_started++;
1014
1015 if (wargs->end_ag >= mp->m_sb.sb_agcount)
1016 break;
1017 }
1018
1019 /*
1020 * wait for workers to complete
1021 */
1022 for (i = 0; i < queues_started; i++)
1023 destroy_work_queue(&queues[i]);
1024 free(queues);
1025 }
1026
1027 void
1028 wait_for_inode_prefetch(
1029 prefetch_args_t *args)
1030 {
1031 if (args == NULL)
1032 return;
1033
1034 pthread_mutex_lock(&args->lock);
1035
1036 while (!args->can_start_processing) {
1037 pftrace("waiting to start processing AG %d", args->agno);
1038
1039 pthread_cond_wait(&args->start_processing, &args->lock);
1040 }
1041 pftrace("can start processing AG %d", args->agno);
1042
1043 pthread_mutex_unlock(&args->lock);
1044 }
1045
1046 void
1047 cleanup_inode_prefetch(
1048 prefetch_args_t *args)
1049 {
1050 if (args == NULL)
1051 return;
1052
1053 pftrace("waiting AG %d prefetch to finish", args->agno);
1054
1055 if (args->queuing_thread)
1056 pthread_join(args->queuing_thread, NULL);
1057
1058 pftrace("AG %d prefetch done", args->agno);
1059
1060 pthread_mutex_destroy(&args->lock);
1061 pthread_cond_destroy(&args->start_reading);
1062 pthread_cond_destroy(&args->start_processing);
1063 sem_destroy(&args->ra_count);
1064 btree_destroy(args->io_queue);
1065
1066 free(args);
1067 }
1068
#ifdef XR_PF_TRACE

/* Trace output stream; NULL while tracing is not (or could not be) set up. */
static FILE	*pf_trace_file;

/*
 * Open the prefetch trace file and make it line buffered.  If the file
 * cannot be opened a message is printed and tracing stays disabled.
 */
void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	if (pf_trace_file == NULL) {
		fprintf(stderr, "cannot open prefetch trace file: %s\n",
			strerror(errno));
		return;
	}
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

/*
 * Close the trace file, if one is open.
 */
void
pftrace_done(void)
{
	if (pf_trace_file == NULL)
		return;

	fclose(pf_trace_file);
	pf_trace_file = NULL;
}

/*
 * Write one timestamped trace line of the form
 * "<sec>.<usec> <func>: <formatted msg>".  The formatted message is
 * truncated to fit a 200 byte buffer.  Does nothing when no trace file is
 * open.
 */
void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	if (pf_trace_file == NULL)
		return;

	gettimeofday(&tv, NULL);

	/* vsnprintf always NUL-terminates within the given size */
	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	va_end(args);

	/* time_t and suseconds_t are not necessarily unsigned long */
	fprintf(pf_trace_file, "%lu.%06lu %s: %s\n",
		(unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec,
		func, buf);
}

#endif