repair/prefetch.c
1 #include <libxfs.h>
2 #include <pthread.h>
3 #include "avl.h"
4 #include "btree.h"
5 #include "globals.h"
6 #include "agheader.h"
7 #include "incore.h"
8 #include "dir2.h"
9 #include "protos.h"
10 #include "err_protos.h"
11 #include "dinode.h"
12 #include "bmap.h"
13 #include "versions.h"
14 #include "threads.h"
15 #include "prefetch.h"
16 #include "progress.h"
17
18 int do_prefetch = 1;
19
20 /*
21  * Performs prefetching by priming the libxfs cache, using a dedicated thread
22  * that scans inodes and reads in blocks ahead of the time they are required.
23 *
24 * Any I/O errors can be safely ignored.
25 */
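
/*
 * Minimal usage sketch of the prefetch API defined below; the calling
 * sequence shown here is illustrative (the real callers live in the
 * phase code):
 *
 *	prefetch_args_t	*pf_args;
 *
 *	pf_args = start_inode_prefetch(agno, 0, NULL);
 *	wait_for_inode_prefetch(pf_args);
 *	(process the AG's inodes, now hitting the primed libxfs cache)
 *	cleanup_inode_prefetch(pf_args);
 */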
26
27 static xfs_mount_t *mp;
28 static int mp_fd;
29 static int pf_max_bytes;
30 static int pf_max_bbs;
31 static int pf_max_fsbs;
32 static int pf_batch_bytes;
33 static int pf_batch_fsbs;
34
35 static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
36
37 /*
38 * Buffer priorities for the libxfs cache
39 *
40 * Directory metadata is ranked higher than other metadata as it's used
41 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
42 */
43
44 /* intermediate directory btree nodes - can't be queued */
45 #define B_DIR_BMAP CACHE_PREFETCH_PRIORITY + 7
46 /* directory metadata in secondary queue */
47 #define B_DIR_META_2 CACHE_PREFETCH_PRIORITY + 6
49 /* dir metadata that had to be fetched from the primary queue to avoid stalling */
49 #define B_DIR_META_H CACHE_PREFETCH_PRIORITY + 5
50 /* single block of directory metadata (can't batch read) */
51 #define B_DIR_META_S CACHE_PREFETCH_PRIORITY + 4
52 /* dir metadata with more than one block fetched in a single I/O */
53 #define B_DIR_META CACHE_PREFETCH_PRIORITY + 3
54 /* inode clusters with directory inodes */
55 #define B_DIR_INODE CACHE_PREFETCH_PRIORITY + 2
56 /* intermediate extent btree nodes */
57 #define B_BMAP CACHE_PREFETCH_PRIORITY + 1
58 /* inode clusters without any directory inodes */
59 #define B_INODE CACHE_PREFETCH_PRIORITY
60
61 /*
62 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
63 * the buffer is for an inode or other metadata.
64 */
65 #define B_IS_INODE(f) (((f) & 5) == 0)
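
/*
 * How B_IS_INODE() classifies the priorities above, assuming the
 * CACHE_PREFETCH_PRIORITY base value has bits 0 and 2 clear so that
 * only the offsets added above matter:
 *
 *	B_INODE (+0), B_DIR_INODE (+2)        -> (f & 5) == 0 -> inode cluster
 *	B_BMAP (+1), B_DIR_META (+3)          -> bit 0 set    -> other metadata
 *	B_DIR_META_S (+4) .. B_DIR_BMAP (+7)  -> bit 2 set    -> other metadata
 */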
66
67 #define DEF_BATCH_BYTES 0x10000
68
69 #define MAX_BUFS 128
70
71 #define IO_THRESHOLD (MAX_BUFS * 2)
72
73 typedef enum pf_which {
74 PF_PRIMARY,
75 PF_SECONDARY,
76 PF_META_ONLY
77 } pf_which_t;
78
79
80 static inline void
81 pf_start_processing(
82 prefetch_args_t *args)
83 {
84 if (!args->can_start_processing) {
85 pftrace("signalling processing for AG %d", args->agno);
86
87 args->can_start_processing = 1;
88 pthread_cond_signal(&args->start_processing);
89 }
90 }
91
92 static inline void
93 pf_start_io_workers(
94 prefetch_args_t *args)
95 {
96 if (!args->can_start_reading) {
97 pftrace("signalling reading for AG %d", args->agno);
98
99 args->can_start_reading = 1;
100 pthread_cond_broadcast(&args->start_reading);
101 }
102 }
103
104
105 static void
106 pf_queue_io(
107 prefetch_args_t *args,
108 struct xfs_buf_map *map,
109 int nmaps,
110 int flag)
111 {
112 struct xfs_buf *bp;
113 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);
114
115 /*
116 * Never block on a buffer lock here, given that the actual repair
117  * code might lock buffers in a different order from us. As the
118  * lock holder is either reading the buffer from disk itself or
119  * completely overwriting it, this behaviour is perfectly fine.
120 */
121 bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
122 if (!bp)
123 return;
124
125 if (bp->b_flags & LIBXFS_B_UPTODATE) {
126 if (B_IS_INODE(flag))
127 pf_read_inode_dirs(args, bp);
128 XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
129 CACHE_PREFETCH_PRIORITY);
130 libxfs_putbuf(bp);
131 return;
132 }
133 XFS_BUF_SET_PRIORITY(bp, flag);
134
135 pthread_mutex_lock(&args->lock);
136
137 btree_insert(args->io_queue, fsbno, bp);
138
139 if (fsbno > args->last_bno_read) {
140 if (B_IS_INODE(flag)) {
141 args->inode_bufs_queued++;
142 if (args->inode_bufs_queued == IO_THRESHOLD)
143 pf_start_io_workers(args);
144 }
145 } else {
146 ASSERT(!B_IS_INODE(flag));
147 XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
148 }
149
150 pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue "
151 "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
152 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
153 args->inode_bufs_queued, args->last_bno_read);
154
155 pf_start_processing(args);
156
157 pthread_mutex_unlock(&args->lock);
158 }
159
160 static int
161 pf_read_bmbt_reclist(
162 prefetch_args_t *args,
163 xfs_bmbt_rec_t *rp,
164 int numrecs)
165 {
166 int i;
167 xfs_bmbt_irec_t irec;
168 xfs_dfilblks_t cp = 0; /* prev count */
169 xfs_dfiloff_t op = 0; /* prev offset */
170 #define MAP_ARRAY_SZ 4
171 struct xfs_buf_map map_array[MAP_ARRAY_SZ];
172 struct xfs_buf_map *map = map_array;
173 int max_extents = MAP_ARRAY_SZ;
174 int nmaps = 0;
175 unsigned int len = 0;
176 int ret = 0;
177
178
179 for (i = 0; i < numrecs; i++) {
180 libxfs_bmbt_disk_get_all(rp + i, &irec);
181
182 if (((i > 0) && (op + cp > irec.br_startoff)) ||
183 (irec.br_blockcount == 0) ||
184 (irec.br_startoff >= fs_max_file_offset))
185 goto out_free;
186
187 if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
188 irec.br_startblock + irec.br_blockcount - 1))
189 goto out_free;
190
191 if (!args->dirs_only && ((irec.br_startoff +
192 irec.br_blockcount) >= mp->m_dirfreeblk))
193 break; /* only Phase 6 reads the free blocks */
194
195 op = irec.br_startoff;
196 cp = irec.br_blockcount;
197
198 while (irec.br_blockcount) {
199 unsigned int bm_len;
200
201 pftrace("queuing dir extent in AG %d", args->agno);
202
203 if (len + irec.br_blockcount >= mp->m_dirblkfsbs)
204 bm_len = mp->m_dirblkfsbs - len;
205 else
206 bm_len = irec.br_blockcount;
207 len += bm_len;
208
209 map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
210 irec.br_startblock);
211 map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
212 nmaps++;
213
214 if (len == mp->m_dirblkfsbs) {
215 pf_queue_io(args, map, nmaps, B_DIR_META);
216 len = 0;
217 nmaps = 0;
218 }
219
220 irec.br_blockcount -= bm_len;
221 irec.br_startblock += bm_len;
222
223 /*
224 * Handle very fragmented dir2 blocks with dynamically
225 * allocated buffer maps.
226 */
227 if (nmaps >= max_extents) {
228 struct xfs_buf_map *old_map = NULL;
229
230 if (map == map_array) {
231 old_map = map;
232 map = NULL;
233 }
234 max_extents *= 2;
235 map = realloc(map, max_extents * sizeof(*map));
236 if (map == NULL) {
237 do_error(
238 _("couldn't malloc dir2 buffer list\n"));
239 exit(1);
240 }
241 if (old_map)
242 memcpy(map, old_map, sizeof(map_array));
243 }
244
245 }
246 }
247 ret = 1;
248 out_free:
249 if (map != map_array)
250 free(map);
251 return ret;
252 }
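
/*
 * Worked example of the mapping loop above (illustrative geometry):
 * with 4k filesystem blocks and 8k directory blocks (m_dirblkfsbs == 2),
 * a 5-block extent starting at fsblock X is queued as two complete
 * 2-block directory buffers at X and X + 2; the trailing block at X + 4
 * stays in the map array (len == 1) and is either completed by the next
 * extent into a discontiguous directory buffer or, if this was the last
 * extent, dropped without being queued.
 */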
253
254 /*
255 * simplified version of the main scan_lbtree. Returns 0 to stop.
256 */
257
258 static int
259 pf_scan_lbtree(
260 xfs_dfsbno_t dbno,
261 int level,
262 int isadir,
263 prefetch_args_t *args,
264 int (*func)(struct xfs_btree_block *block,
265 int level,
266 int isadir,
267 prefetch_args_t *args))
268 {
269 xfs_buf_t *bp;
270 int rc;
271
272 bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
273 XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
274 if (!bp)
275 return 0;
276
277 XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
278
279 rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
280
281 libxfs_putbuf(bp);
282
283 return rc;
284 }
285
286 static int
287 pf_scanfunc_bmap(
288 struct xfs_btree_block *block,
289 int level,
290 int isadir,
291 prefetch_args_t *args)
292 {
293 xfs_bmbt_ptr_t *pp;
294 int numrecs;
295 int i;
296 xfs_dfsbno_t dbno;
297
298 /*
299 * do some validation on the block contents
300 */
301 if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
302 block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
303 (be16_to_cpu(block->bb_level) != level))
304 return 0;
305
306 numrecs = be16_to_cpu(block->bb_numrecs);
307
308 if (level == 0) {
309 if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
310 return 0;
311 return pf_read_bmbt_reclist(args,
312 XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
313 }
314
315 if (numrecs > mp->m_bmap_dmxr[1])
316 return 0;
317
318 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
319
320 for (i = 0; i < numrecs; i++) {
321 dbno = be64_to_cpu(pp[i]);
322 if (!verify_dfsbno(mp, dbno))
323 return 0;
324 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
325 return 0;
326 }
327 return 1;
328 }
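
/*
 * Example of the recursion above for a level-2 directory bmap btree:
 * the root lives in the inode fork and is walked by pf_read_btinode();
 * each level-1 node and level-0 leaf is read synchronously by
 * pf_scan_lbtree() at priority B_DIR_BMAP, and the extent records found
 * in the leaves are handed to pf_read_bmbt_reclist(), which queues the
 * directory blocks for the asynchronous reader threads.
 */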
329
330
331 static void
332 pf_read_btinode(
333 prefetch_args_t *args,
334 xfs_dinode_t *dino,
335 int isadir)
336 {
337 xfs_bmdr_block_t *dib;
338 xfs_bmbt_ptr_t *pp;
339 int i;
340 int level;
341 int numrecs;
342 int dsize;
343 xfs_dfsbno_t dbno;
344
345 dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
346
347 level = be16_to_cpu(dib->bb_level);
348 numrecs = be16_to_cpu(dib->bb_numrecs);
349
350 if ((numrecs == 0) || (level == 0) ||
351 (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
352 return;
353 /*
354 * use bmdr/dfork_dsize since the root block is in the data fork
355 */
356 if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
357 return;
358
359 dsize = XFS_DFORK_DSIZE(dino, mp);
360 pp = XFS_BMDR_PTR_ADDR(dib, 1, xfs_bmdr_maxrecs(mp, dsize, 0));
361
362 for (i = 0; i < numrecs; i++) {
363 dbno = be64_to_cpu(pp[i]);
364 if (!verify_dfsbno(mp, dbno))
365 break;
366 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
367 break;
368 }
369 }
370
371 static void
372 pf_read_exinode(
373 prefetch_args_t *args,
374 xfs_dinode_t *dino)
375 {
376 pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
377 be32_to_cpu(dino->di_nextents));
378 }
379
380 static void
381 pf_read_inode_dirs(
382 prefetch_args_t *args,
383 xfs_buf_t *bp)
384 {
385 xfs_dinode_t *dino;
386 int icnt = 0;
387 int hasdir = 0;
388 int isadir;
389
390 libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
391 if (bp->b_error)
392 return;
393
394 for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
395 dino = xfs_make_iptr(mp, bp, icnt);
396
397 /*
398  * We only prefetch the contents of directories; for other inodes
399  * we just prefetch their bmap btree node blocks
400 */
401 isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
402 hasdir |= isadir;
403
404 if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
405 continue;
406
407 if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
408 args->dirs_only))
409 continue;
410
411 /*
412 * do some checks on the inode to see if we can prefetch
413 * its directory data. It's a cut down version of
414 * process_dinode_int() in dinode.c.
415 */
416 if (dino->di_format > XFS_DINODE_FMT_BTREE)
417 continue;
418
419 if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
420 continue;
421
422 if (!XFS_DINODE_GOOD_VERSION(dino->di_version))
423 continue;
424
425 if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
426 continue;
427
428 if ((dino->di_forkoff != 0) &&
429 (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
430 continue;
431
432 switch (dino->di_format) {
433 case XFS_DINODE_FMT_EXTENTS:
434 pf_read_exinode(args, dino);
435 break;
436 case XFS_DINODE_FMT_BTREE:
437 pf_read_btinode(args, dino, isadir);
438 break;
439 }
440 }
441 if (hasdir)
442 XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
443 }
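
/*
 * Example of the filtering above: a regular file in extent format is
 * skipped entirely; a directory in extent format has its on-disk extent
 * records handed to pf_read_exinode() so its blocks get queued; a
 * regular file in btree format (and only when !args->dirs_only) has
 * just its bmap btree node blocks prefetched via pf_read_btinode().
 */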
444
445 /*
446 * pf_batch_read must be called with the lock locked.
447 */
448 static void
449 pf_batch_read(
450 prefetch_args_t *args,
451 pf_which_t which,
452 void *buf)
453 {
454 xfs_buf_t *bplist[MAX_BUFS];
455 unsigned int num;
456 off64_t first_off, last_off, next_off;
457 int len, size;
458 int i;
459 int inode_bufs;
460 unsigned long fsbno = 0;
461 unsigned long max_fsbno;
462 char *pbuf;
463
464 for (;;) {
465 num = 0;
466 if (which == PF_SECONDARY) {
467 bplist[0] = btree_find(args->io_queue, 0, &fsbno);
468 max_fsbno = MIN(fsbno + pf_max_fsbs,
469 args->last_bno_read);
470 } else {
471 bplist[0] = btree_find(args->io_queue,
472 args->last_bno_read, &fsbno);
473 max_fsbno = fsbno + pf_max_fsbs;
474 }
475 while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
476 /*
477 * Discontiguous buffers need special handling, so stop
478 * gathering new buffers and process the list and this
479  * discontiguous buffer immediately. This avoids the
480  * complexity of keeping a separate discontiguous buffer
481 * list and seeking back over ranges we've already done
482 * optimised reads for.
483 */
484 if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
485 num++;
486 break;
487 }
488
489 if (which != PF_META_ONLY ||
490 !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
491 num++;
492 if (num == MAX_BUFS)
493 break;
494 bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
495 }
496 if (!num)
497 return;
498
499 /*
500 * do a big read if 25% of the potential buffer is useful,
501  * otherwise find as many blocks that are close together as
502  * possible and read them in a single read
503 */
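/*
 * Illustrative numbers for the check below (assuming 4k blocks, i.e.
 * sb_blocklog == 12): if the gathered buffers span 256k, the threshold
 * is 256k >> 15 == 8 buffers. Gathering fewer than that means the span
 * is too sparse for one large read, so the list is trimmed to a leading
 * run of buffers lying within pf_batch_bytes (64k) of each other and
 * only those are read.
 */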
504 first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
505 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
506 XFS_BUF_SIZE(bplist[num-1]);
507 while (num > 1 && last_off - first_off > pf_max_bytes) {
508 num--;
509 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
510 XFS_BUF_SIZE(bplist[num-1]);
511 }
512 if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
513 /*
514 * not enough blocks for one big read, so determine
515 * the number of blocks that are close enough.
516 */
517 last_off = first_off + XFS_BUF_SIZE(bplist[0]);
518 for (i = 1; i < num; i++) {
519 next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
520 XFS_BUF_SIZE(bplist[i]);
521 if (next_off - last_off > pf_batch_bytes)
522 break;
523 last_off = next_off;
524 }
525 num = i;
526 }
527
528 for (i = 0; i < num; i++) {
529 if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
530 XFS_BUF_ADDR(bplist[i]))) == NULL)
531 do_error(_("prefetch corruption\n"));
532 }
533
534 if (which == PF_PRIMARY) {
535 for (inode_bufs = 0, i = 0; i < num; i++) {
536 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
537 inode_bufs++;
538 }
539 args->inode_bufs_queued -= inode_bufs;
540 if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
541 pf_batch_fsbs)
542 args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
543 }
544 #ifdef XR_PF_TRACE
545 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
546 (long long)XFS_BUF_ADDR(bplist[0]),
547 (long long)XFS_BUF_ADDR(bplist[num-1]), num,
548 (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
549 args->last_bno_read, args->inode_bufs_queued);
550 #endif
551 pthread_mutex_unlock(&args->lock);
552
553 /*
554  * now read the data and copy it into the xfs_buf_t's
555 */
556 len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);
557
558 /*
559 * Check the last buffer on the list to see if we need to
560  * process a discontiguous buffer. The gather loop above
561 * guarantees that only the last buffer in the list will be a
562 * discontiguous buffer.
563 */
564 if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
565 libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
566 bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
567 libxfs_putbuf(bplist[num - 1]);
568 num--;
569 }
570
571 if (len > 0) {
572 /*
573 * go through the xfs_buf_t list copying from the
574 * read buffer into the xfs_buf_t's and release them.
575 */
576 for (i = 0; i < num; i++) {
577
578 pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
579 size = XFS_BUF_SIZE(bplist[i]);
580 if (len < size)
581 break;
582 memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
583 bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
584 LIBXFS_B_UNCHECKED);
585 len -= size;
586 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
587 pf_read_inode_dirs(args, bplist[i]);
588 else if (which == PF_META_ONLY)
589 XFS_BUF_SET_PRIORITY(bplist[i],
590 B_DIR_META_H);
591 else if (which == PF_PRIMARY && num == 1)
592 XFS_BUF_SET_PRIORITY(bplist[i],
593 B_DIR_META_S);
594 }
595 }
596 for (i = 0; i < num; i++) {
597 pftrace("putbuf %c %p (%llu) in AG %d",
598 B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
599 bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
600 args->agno);
601 libxfs_putbuf(bplist[i]);
602 }
603 pthread_mutex_lock(&args->lock);
604 if (which != PF_SECONDARY) {
605 pftrace("inode_bufs_queued for AG %d = %d", args->agno,
606 args->inode_bufs_queued);
607 /*
608  * if the primary inode queue is running low, process metadata
609  * in both queues to avoid I/O starvation, as the
610 * processing thread would be waiting for a metadata
611 * buffer
612 */
613 if (which == PF_PRIMARY && !args->queuing_done &&
614 args->inode_bufs_queued < IO_THRESHOLD) {
615 pftrace("reading metadata bufs from primary queue for AG %d",
616 args->agno);
617
618 pf_batch_read(args, PF_META_ONLY, buf);
619
620 pftrace("reading bufs from secondary queue for AG %d",
621 args->agno);
622
623 pf_batch_read(args, PF_SECONDARY, buf);
624 }
625 }
626 }
627 }
628
629 static void *
630 pf_io_worker(
631 void *param)
632 {
633 prefetch_args_t *args = param;
634 void *buf = memalign(libxfs_device_alignment(),
635 pf_max_bytes);
636
637 if (buf == NULL)
638 return NULL;
639
640 pthread_mutex_lock(&args->lock);
641 while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
642 pftrace("waiting to start prefetch I/O for AG %d", args->agno);
643
644 while (!args->can_start_reading && !args->queuing_done)
645 pthread_cond_wait(&args->start_reading, &args->lock);
646
647 pftrace("starting prefetch I/O for AG %d", args->agno);
648
649 pf_batch_read(args, PF_PRIMARY, buf);
650 pf_batch_read(args, PF_SECONDARY, buf);
651
652 pftrace("ran out of bufs to prefetch for AG %d", args->agno);
653
654 if (!args->queuing_done)
655 args->can_start_reading = 0;
656 }
657 pthread_mutex_unlock(&args->lock);
658
659 free(buf);
660
661 pftrace("finished prefetch I/O for AG %d", args->agno);
662
663 return NULL;
664 }
665
666 static int
667 pf_create_prefetch_thread(
668 prefetch_args_t *args);
669
670 static void *
671 pf_queuing_worker(
672 void *param)
673 {
674 prefetch_args_t *args = param;
675 int num_inos;
676 ino_tree_node_t *irec;
677 ino_tree_node_t *cur_irec;
678 int blks_per_cluster;
679 xfs_agblock_t bno;
680 int i;
681 int err;
682
683 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
684 if (blks_per_cluster == 0)
685 blks_per_cluster = 1;
686
687 for (i = 0; i < PF_THREAD_COUNT; i++) {
688 err = pthread_create(&args->io_threads[i], NULL,
689 pf_io_worker, args);
690 if (err != 0) {
691 do_warn(_("failed to create prefetch thread: %s\n"),
692 strerror(err));
693 if (i == 0) {
694 pf_start_processing(args);
695 return NULL;
696 }
697 /*
698 * since we have at least one I/O thread, use them for
699 * prefetch
700 */
701 break;
702 }
703 }
704 pftrace("starting prefetch for AG %d", args->agno);
705
706 for (irec = findfirst_inode_rec(args->agno); irec != NULL;
707 irec = next_ino_rec(irec)) {
708
709 cur_irec = irec;
710
711 num_inos = XFS_INODES_PER_CHUNK;
712 while (num_inos < XFS_IALLOC_INODES(mp) && irec != NULL) {
713 irec = next_ino_rec(irec);
714 num_inos += XFS_INODES_PER_CHUNK;
715 }
716
717 if (args->dirs_only && cur_irec->ino_isa_dir == 0)
718 continue;
719 #ifdef XR_PF_TRACE
720 sem_getvalue(&args->ra_count, &i);
721 pftrace("queuing irec %p in AG %d, sem count = %d",
722 irec, args->agno, i);
723 #endif
724 err = sem_trywait(&args->ra_count);
725 if (err < 0 && errno == EAGAIN) {
726 /*
727 * Kick the queue once we have reached the limit;
728 * without this the threads processing the inodes
729 * might get stuck on a buffer that has been locked
730 * and added to the I/O queue but is waiting for
731 * the thread to be woken.
732 */
733 pf_start_io_workers(args);
734 sem_wait(&args->ra_count);
735 }
736
737 num_inos = 0;
738 bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
739
740 do {
741 struct xfs_buf_map map;
742
743 map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
744 map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
745 pf_queue_io(args, &map, 1,
746 (cur_irec->ino_isa_dir != 0) ? B_DIR_INODE
747 : B_INODE);
748 bno += blks_per_cluster;
749 num_inos += inodes_per_cluster;
750 } while (num_inos < XFS_IALLOC_INODES(mp));
751 }
752
753 pthread_mutex_lock(&args->lock);
754
755 pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
756 args->agno, args->inode_bufs_queued);
757
758 args->queuing_done = 1;
759 pf_start_io_workers(args);
760 pf_start_processing(args);
761 pthread_mutex_unlock(&args->lock);
762
763 /* now wait for the readers to finish */
764 for (i = 0; i < PF_THREAD_COUNT; i++)
765 if (args->io_threads[i])
766 pthread_join(args->io_threads[i], NULL);
767
768 pftrace("prefetch for AG %d finished", args->agno);
769
770 pthread_mutex_lock(&args->lock);
771
772 ASSERT(btree_is_empty(args->io_queue));
773
774 args->prefetch_done = 1;
775 if (args->next_args)
776 pf_create_prefetch_thread(args->next_args);
777
778 pthread_mutex_unlock(&args->lock);
779
780 return NULL;
781 }
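
/*
 * Illustrative arithmetic for the cluster loop in pf_queuing_worker()
 * (assumed geometry, not a requirement): with 4k blocks, 8k inode
 * clusters and 256 byte inodes, blks_per_cluster == 2 and
 * inodes_per_cluster == 32, so a 64-inode chunk (XFS_INODES_PER_CHUNK)
 * is queued as two 2-block cluster buffers.
 */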
782
783 static int
784 pf_create_prefetch_thread(
785 prefetch_args_t *args)
786 {
787 int err;
788
789 pftrace("creating queue thread for AG %d", args->agno);
790
791 err = pthread_create(&args->queuing_thread, NULL,
792 pf_queuing_worker, args);
793 if (err != 0) {
794 do_warn(_("failed to create prefetch thread: %s\n"),
795 strerror(err));
796 cleanup_inode_prefetch(args);
797 }
798
799 return err == 0;
800 }
801
802 void
803 init_prefetch(
804 xfs_mount_t *pmp)
805 {
806 mp = pmp;
807 mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
808 pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
809 pf_max_bbs = pf_max_bytes >> BBSHIFT;
810 pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
811 pf_batch_bytes = DEF_BATCH_BYTES;
812 pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
813 }
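
/*
 * Example sizing from init_prefetch() above, assuming a 4k page size
 * and a 4k filesystem block size: pf_max_bytes = 4k << 7 = 512k per
 * read buffer, pf_max_bbs = 1024 basic blocks, pf_max_fsbs = 128
 * blocks, pf_batch_bytes = 64k and pf_batch_fsbs = 64k >> 13 = 8.
 */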
814
815 prefetch_args_t *
816 start_inode_prefetch(
817 xfs_agnumber_t agno,
818 int dirs_only,
819 prefetch_args_t *prev_args)
820 {
821 prefetch_args_t *args;
822 long max_queue;
823
824 if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
825 return NULL;
826
827 args = calloc(1, sizeof(prefetch_args_t));
828
829 btree_init(&args->io_queue);
830 if (pthread_mutex_init(&args->lock, NULL) != 0)
831 do_error(_("failed to initialize prefetch mutex\n"));
832 if (pthread_cond_init(&args->start_reading, NULL) != 0)
833 do_error(_("failed to initialize prefetch cond var\n"));
834 if (pthread_cond_init(&args->start_processing, NULL) != 0)
835 do_error(_("failed to initialize prefetch cond var\n"));
836 args->agno = agno;
837 args->dirs_only = dirs_only;
838
839 /*
840 * use only 1/8 of the libxfs cache as we are only counting inodes
841 * and not any other associated metadata like directories
842 */
843
844 max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
845 if (XFS_INODE_CLUSTER_SIZE(mp) > mp->m_sb.sb_blocksize)
846 max_queue = max_queue * (XFS_INODE_CLUSTER_SIZE(mp) >>
847 mp->m_sb.sb_blocklog) / XFS_IALLOC_BLOCKS(mp);
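
/*
 * Example of the queue sizing above (illustrative numbers): with a
 * libxfs cache of 32768 buffers and 4 threads, max_queue starts at
 * 32768 / 4 / 8 = 1024 inode cluster buffers; if clusters span two
 * blocks and an inode chunk spans four blocks, the limit is rescaled
 * to 1024 * 2 / 4 = 512 outstanding clusters.
 */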
848
849 sem_init(&args->ra_count, 0, max_queue);
850
851 if (!prev_args) {
852 if (!pf_create_prefetch_thread(args))
853 return NULL;
854 } else {
855 pthread_mutex_lock(&prev_args->lock);
856 if (prev_args->prefetch_done) {
857 if (!pf_create_prefetch_thread(args))
858 args = NULL;
859 } else
860 prev_args->next_args = args;
861 pthread_mutex_unlock(&prev_args->lock);
862 }
863
864 return args;
865 }
866
867 /*
868 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
869  * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
870 * or process @end_ag). The function starts prefetch on the first AG, then loops
871 * starting prefetch on the next AG and then blocks processing the current AG as
872 * the prefetch queue brings inodes into the processing queue.
873 *
874 * There is only one prefetch taking place at a time, so the prefetch on the
875 * next AG only starts once the current AG has been completely prefetched. Hence
876 * the prefetch of the next AG will start some time before the processing of the
877  * current AG finishes, ensuring that when we iterate and start processing the
878 * next AG there is already a significant queue of inodes to process.
879 *
880 * Prefetch is done this way to prevent it from running too far ahead of the
881 * processing. Allowing it to do so can cause cache thrashing, where new
882 * prefetch causes previously prefetched buffers to be reclaimed before the
883 * processing thread uses them. This results in reading all the inodes and
884 * metadata twice per phase and it greatly slows down the processing. Hence we
885 * have to carefully control how far ahead we prefetch...
886 */
887 static void
888 prefetch_ag_range(
889 struct work_queue *work,
890 xfs_agnumber_t start_ag,
891 xfs_agnumber_t end_ag,
892 bool dirs_only,
893 void (*func)(struct work_queue *,
894 xfs_agnumber_t, void *))
895 {
896 int i;
897 struct prefetch_args *pf_args[2];
898
899 pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
900 for (i = start_ag; i < end_ag; i++) {
901 /* Don't prefetch end_ag */
902 if (i + 1 < end_ag)
903 pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
904 dirs_only, pf_args[i & 1]);
905 func(work, i, pf_args[i & 1]);
906 }
907 }
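
/*
 * Worked example of the loop above with start_ag = 0 and end_ag = 3:
 * prefetch of AG 0 is started first; then for i = 0 prefetch of AG 1 is
 * started and AG 0 is processed; for i = 1 prefetch of AG 2 is started
 * and AG 1 is processed; for i = 2 nothing new is prefetched (end_ag is
 * excluded) and AG 2 is processed. The two pf_args slots alternate
 * between the AG being processed and the one being prefetched.
 */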
908
909 struct pf_work_args {
910 xfs_agnumber_t start_ag;
911 xfs_agnumber_t end_ag;
912 bool dirs_only;
913 void (*func)(struct work_queue *, xfs_agnumber_t, void *);
914 };
915
916 static void
917 prefetch_ag_range_work(
918 struct work_queue *work,
919 xfs_agnumber_t unused,
920 void *args)
921 {
922 struct pf_work_args *wargs = args;
923
924 prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
925 wargs->dirs_only, wargs->func);
926 free(args);
927 }
928
929 /*
930 * Do inode prefetch in the most optimal way for the context under which repair
931 * has been run.
932 */
933 void
934 do_inode_prefetch(
935 struct xfs_mount *mp,
936 int stride,
937 void (*func)(struct work_queue *,
938 xfs_agnumber_t, void *),
939 bool check_cache,
940 bool dirs_only)
941 {
942 int i;
943 struct work_queue queue;
944 struct work_queue *queues;
945 int queues_started = 0;
946
947 /*
948 * If the previous phases of repair have not overflowed the buffer
949 * cache, then we don't need to re-read any of the metadata in the
950 * filesystem - it's all in the cache. In that case, run a thread per
951 * CPU to maximise parallelism of the queue to be processed.
952 */
953 if (check_cache && !libxfs_bcache_overflowed()) {
954 queue.mp = mp;
955 create_work_queue(&queue, mp, libxfs_nproc());
956 for (i = 0; i < mp->m_sb.sb_agcount; i++)
957 queue_work(&queue, func, i, NULL);
958 destroy_work_queue(&queue);
959 return;
960 }
961
962 /*
963 * single threaded behaviour - single prefetch thread, processed
964 * directly after each AG is queued.
965 */
966 if (!stride) {
967 queue.mp = mp;
968 prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
969 dirs_only, func);
970 return;
971 }
972
973 /*
974 * create one worker thread for each segment of the volume
975 */
976 queues = malloc(thread_count * sizeof(work_queue_t));
977 for (i = 0; i < thread_count; i++) {
978 struct pf_work_args *wargs;
979
980 wargs = malloc(sizeof(struct pf_work_args));
981 wargs->start_ag = i * stride;
982 wargs->end_ag = min((i + 1) * stride,
983 mp->m_sb.sb_agcount);
984 wargs->dirs_only = dirs_only;
985 wargs->func = func;
986
987 create_work_queue(&queues[i], mp, 1);
988 queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
989 queues_started++;
990
991 if (wargs->end_ag >= mp->m_sb.sb_agcount)
992 break;
993 }
994
995 /*
996 * wait for workers to complete
997 */
998 for (i = 0; i < queues_started; i++)
999 destroy_work_queue(&queues[i]);
1000 free(queues);
1001 }
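
/*
 * Example of the striding above (illustrative numbers): with 16 AGs and
 * stride == 4, four single-threaded work queues are created covering
 * AGs 0-3, 4-7, 8-11 and 12-15; each queue runs its own
 * prefetch_ag_range() loop, so prefetch never runs more than one AG
 * ahead of processing within its segment.
 */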
1002
1003 void
1004 wait_for_inode_prefetch(
1005 prefetch_args_t *args)
1006 {
1007 if (args == NULL)
1008 return;
1009
1010 pthread_mutex_lock(&args->lock);
1011
1012 while (!args->can_start_processing) {
1013 pftrace("waiting to start processing AG %d", args->agno);
1014
1015 pthread_cond_wait(&args->start_processing, &args->lock);
1016 }
1017 pftrace("can start processing AG %d", args->agno);
1018
1019 pthread_mutex_unlock(&args->lock);
1020 }
1021
1022 void
1023 cleanup_inode_prefetch(
1024 prefetch_args_t *args)
1025 {
1026 if (args == NULL)
1027 return;
1028
1029 pftrace("waiting AG %d prefetch to finish", args->agno);
1030
1031 if (args->queuing_thread)
1032 pthread_join(args->queuing_thread, NULL);
1033
1034 pftrace("AG %d prefetch done", args->agno);
1035
1036 pthread_mutex_destroy(&args->lock);
1037 pthread_cond_destroy(&args->start_reading);
1038 pthread_cond_destroy(&args->start_processing);
1039 sem_destroy(&args->ra_count);
1040 btree_destroy(args->io_queue);
1041
1042 free(args);
1043 }
1044
1045 #ifdef XR_PF_TRACE
1046
1047 static FILE *pf_trace_file;
1048
1049 void
1050 pftrace_init(void)
1051 {
1052 pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
1053 setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
1054 }
1055
1056 void
1057 pftrace_done(void)
1058 {
1059 fclose(pf_trace_file);
1060 }
1061
1062 void
1063 _pftrace(const char *func, const char *msg, ...)
1064 {
1065 char buf[200];
1066 struct timeval tv;
1067 va_list args;
1068
1069 gettimeofday(&tv, NULL);
1070
1071 va_start(args, msg);
1072 vsnprintf(buf, sizeof(buf), msg, args);
1073 buf[sizeof(buf)-1] = '\0';
1074 va_end(args);
1075
1076 fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec,
1077 func, buf);
1078 }
1079
1080 #endif