]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - repair/prefetch.c
xfs: support embedded dfops in transaction
[thirdparty/xfsprogs-dev.git] / repair / prefetch.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "libxfs.h"
4 #include <pthread.h>
5 #include "avl.h"
6 #include "btree.h"
7 #include "globals.h"
8 #include "agheader.h"
9 #include "incore.h"
10 #include "dir2.h"
11 #include "protos.h"
12 #include "err_protos.h"
13 #include "dinode.h"
14 #include "bmap.h"
15 #include "versions.h"
16 #include "threads.h"
17 #include "prefetch.h"
18 #include "progress.h"
19
20 int do_prefetch = 1;
21
22 /*
23 * Performs prefetching by priming the libxfs cache by using a dedicate thread
24 * scanning inodes and reading blocks in ahead of time they are required.
25 *
26 * Any I/O errors can be safely ignored.
27 */
28
29 static xfs_mount_t *mp;
30 static int mp_fd;
31 static int pf_max_bytes;
32 static int pf_max_bbs;
33 static int pf_max_fsbs;
34 static int pf_batch_bytes;
35 static int pf_batch_fsbs;
36
37 static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
38
/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 *
 * Every expansion is parenthesized so the macros can be used safely inside
 * larger expressions.
 */

/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	(CACHE_PREFETCH_PRIORITY + 7)
/* directory metadata in secondary queue */
#define B_DIR_META_2	(CACHE_PREFETCH_PRIORITY + 6)
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	(CACHE_PREFETCH_PRIORITY + 5)
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	(CACHE_PREFETCH_PRIORITY + 4)
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	(CACHE_PREFETCH_PRIORITY + 3)
/* inode clusters with directory inodes */
#define B_DIR_INODE	(CACHE_PREFETCH_PRIORITY + 2)
/* intermediate extent btree nodes */
#define B_BMAP		(CACHE_PREFETCH_PRIORITY + 1)
/* inode clusters without any directory entries */
#define B_INODE		(CACHE_PREFETCH_PRIORITY)

/*
 * Test bits 0 and 2 of the "priority tag" of the buffer: when neither is set
 * the buffer is for an inode cluster, otherwise it is other metadata.
 *
 * NOTE(review): this matches the offsets above (+0 and +2 are the only ones
 * with both bits clear) provided the low three bits of
 * CACHE_PREFETCH_PRIORITY are zero - confirm against its definition.
 */
#define B_IS_INODE(f)	(((f) & 5) == 0)

/* default for pf_batch_bytes, see init_prefetch() */
#define DEF_BATCH_BYTES	0x10000

/* most buffers gathered for a single batched read */
#define MAX_BUFS	128

/* number of queued inode buffers at which the I/O workers are started */
#define IO_THRESHOLD	(MAX_BUFS * 2)
74
/* Selects which part of the I/O queue pf_batch_read() services. */
enum pf_which {
	PF_PRIMARY = 0,		/* buffers from last_bno_read onwards */
	PF_SECONDARY = 1,	/* buffers from the start up to last_bno_read */
	PF_META_ONLY = 2	/* primary range, inode buffers skipped */
};
typedef enum pf_which	pf_which_t;
80
81
82 static inline void
83 pf_start_processing(
84 prefetch_args_t *args)
85 {
86 if (!args->can_start_processing) {
87 pftrace("signalling processing for AG %d", args->agno);
88
89 args->can_start_processing = 1;
90 pthread_cond_signal(&args->start_processing);
91 }
92 }
93
94 static inline void
95 pf_start_io_workers(
96 prefetch_args_t *args)
97 {
98 if (!args->can_start_reading) {
99 pftrace("signalling reading for AG %d", args->agno);
100
101 args->can_start_reading = 1;
102 pthread_cond_broadcast(&args->start_reading);
103 }
104 }
105
106
107 static void
108 pf_queue_io(
109 prefetch_args_t *args,
110 struct xfs_buf_map *map,
111 int nmaps,
112 int flag)
113 {
114 struct xfs_buf *bp;
115 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);
116
117 /*
118 * Never block on a buffer lock here, given that the actual repair
119 * code might lock buffers in a different order from us. Given that
120 * the lock holder is either reading it from disk himself or
121 * completely overwriting it this behaviour is perfectly fine.
122 */
123 bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
124 if (!bp)
125 return;
126
127 if (bp->b_flags & LIBXFS_B_UPTODATE) {
128 if (B_IS_INODE(flag))
129 pf_read_inode_dirs(args, bp);
130 XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
131 CACHE_PREFETCH_PRIORITY);
132 libxfs_putbuf(bp);
133 return;
134 }
135 XFS_BUF_SET_PRIORITY(bp, flag);
136
137 pthread_mutex_lock(&args->lock);
138
139 btree_insert(args->io_queue, fsbno, bp);
140
141 if (fsbno > args->last_bno_read) {
142 if (B_IS_INODE(flag)) {
143 args->inode_bufs_queued++;
144 if (args->inode_bufs_queued == IO_THRESHOLD)
145 pf_start_io_workers(args);
146 }
147 } else {
148 ASSERT(!B_IS_INODE(flag));
149 XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
150 }
151
152 pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
153 "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
154 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
155 args->inode_bufs_queued, args->last_bno_read);
156
157 pf_start_processing(args);
158
159 pthread_mutex_unlock(&args->lock);
160 }
161
162 static int
163 pf_read_bmbt_reclist(
164 prefetch_args_t *args,
165 xfs_bmbt_rec_t *rp,
166 int numrecs)
167 {
168 int i;
169 xfs_bmbt_irec_t irec;
170 xfs_filblks_t cp = 0; /* prev count */
171 xfs_fileoff_t op = 0; /* prev offset */
172 #define MAP_ARRAY_SZ 4
173 struct xfs_buf_map map_array[MAP_ARRAY_SZ];
174 struct xfs_buf_map *map = map_array;
175 int max_extents = MAP_ARRAY_SZ;
176 int nmaps = 0;
177 unsigned int len = 0;
178 int ret = 0;
179
180
181 for (i = 0; i < numrecs; i++) {
182 libxfs_bmbt_disk_get_all(rp + i, &irec);
183
184 if (((i > 0) && (op + cp > irec.br_startoff)) ||
185 (irec.br_blockcount == 0) ||
186 (irec.br_startoff >= fs_max_file_offset))
187 goto out_free;
188
189 if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
190 irec.br_startblock + irec.br_blockcount - 1))
191 goto out_free;
192
193 if (!args->dirs_only && ((irec.br_startoff +
194 irec.br_blockcount) >= mp->m_dir_geo->freeblk))
195 break; /* only Phase 6 reads the free blocks */
196
197 op = irec.br_startoff;
198 cp = irec.br_blockcount;
199
200 while (irec.br_blockcount) {
201 unsigned int bm_len;
202
203 pftrace("queuing dir extent in AG %d", args->agno);
204
205 if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
206 bm_len = mp->m_dir_geo->fsbcount - len;
207 else
208 bm_len = irec.br_blockcount;
209 len += bm_len;
210
211 map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
212 irec.br_startblock);
213 map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
214 nmaps++;
215
216 if (len == mp->m_dir_geo->fsbcount) {
217 pf_queue_io(args, map, nmaps, B_DIR_META);
218 len = 0;
219 nmaps = 0;
220 }
221
222 irec.br_blockcount -= bm_len;
223 irec.br_startblock += bm_len;
224
225 /*
226 * Handle very fragmented dir2 blocks with dynamically
227 * allocated buffer maps.
228 */
229 if (nmaps >= max_extents) {
230 struct xfs_buf_map *old_map = NULL;
231
232 if (map == map_array) {
233 old_map = map;
234 map = NULL;
235 }
236 max_extents *= 2;
237 map = realloc(map, max_extents * sizeof(*map));
238 if (map == NULL) {
239 do_error(
240 _("couldn't malloc dir2 buffer list\n"));
241 exit(1);
242 }
243 if (old_map)
244 memcpy(map, old_map, sizeof(map_array));
245 }
246
247 }
248 }
249 ret = 1;
250 out_free:
251 if (map != map_array)
252 free(map);
253 return ret;
254 }
255
256 /*
257 * simplified version of the main scan_lbtree. Returns 0 to stop.
258 */
259
260 static int
261 pf_scan_lbtree(
262 xfs_fsblock_t dbno,
263 int level,
264 int isadir,
265 prefetch_args_t *args,
266 int (*func)(struct xfs_btree_block *block,
267 int level,
268 int isadir,
269 prefetch_args_t *args))
270 {
271 xfs_buf_t *bp;
272 int rc;
273
274 bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
275 XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
276 if (!bp)
277 return 0;
278
279 XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
280
281 /*
282 * If the verifier flagged a problem with the buffer, we can't trust
283 * its contents for the purposes of reading ahead. Stop prefetching
284 * the tree and mark the buffer unchecked so that the next read of the
285 * buffer will retain the error status and be acted upon appropriately.
286 */
287 if (bp->b_error) {
288 bp->b_flags |= LIBXFS_B_UNCHECKED;
289 libxfs_putbuf(bp);
290 return 0;
291 }
292
293 rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
294
295 libxfs_putbuf(bp);
296
297 return rc;
298 }
299
300 static int
301 pf_scanfunc_bmap(
302 struct xfs_btree_block *block,
303 int level,
304 int isadir,
305 prefetch_args_t *args)
306 {
307 xfs_bmbt_ptr_t *pp;
308 int numrecs;
309 int i;
310 xfs_fsblock_t dbno;
311
312 /*
313 * do some validation on the block contents
314 */
315 if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
316 block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
317 (be16_to_cpu(block->bb_level) != level))
318 return 0;
319
320 numrecs = be16_to_cpu(block->bb_numrecs);
321
322 if (level == 0) {
323 if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
324 return 0;
325 return pf_read_bmbt_reclist(args,
326 XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
327 }
328
329 if (numrecs > mp->m_bmap_dmxr[1])
330 return 0;
331
332 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
333
334 for (i = 0; i < numrecs; i++) {
335 dbno = get_unaligned_be64(&pp[i]);
336 if (!verify_dfsbno(mp, dbno))
337 return 0;
338 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
339 return 0;
340 }
341 return 1;
342 }
343
344
345 static void
346 pf_read_btinode(
347 prefetch_args_t *args,
348 xfs_dinode_t *dino,
349 int isadir)
350 {
351 xfs_bmdr_block_t *dib;
352 xfs_bmbt_ptr_t *pp;
353 int i;
354 int level;
355 int numrecs;
356 int dsize;
357 xfs_fsblock_t dbno;
358
359 dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
360
361 level = be16_to_cpu(dib->bb_level);
362 numrecs = be16_to_cpu(dib->bb_numrecs);
363
364 if ((numrecs == 0) || (level == 0) ||
365 (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
366 return;
367 /*
368 * use bmdr/dfork_dsize since the root block is in the data fork
369 */
370 if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
371 return;
372
373 dsize = XFS_DFORK_DSIZE(dino, mp);
374 pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));
375
376 for (i = 0; i < numrecs; i++) {
377 dbno = get_unaligned_be64(&pp[i]);
378 if (!verify_dfsbno(mp, dbno))
379 break;
380 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
381 break;
382 }
383 }
384
385 static void
386 pf_read_exinode(
387 prefetch_args_t *args,
388 xfs_dinode_t *dino)
389 {
390 pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
391 be32_to_cpu(dino->di_nextents));
392 }
393
394 static void
395 pf_read_inode_dirs(
396 prefetch_args_t *args,
397 xfs_buf_t *bp)
398 {
399 xfs_dinode_t *dino;
400 int icnt = 0;
401 int hasdir = 0;
402 int isadir;
403
404 libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
405 if (bp->b_error)
406 return;
407
408 for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) {
409 dino = xfs_make_iptr(mp, bp, icnt);
410
411 /*
412 * We are only prefetching directory contents in extents
413 * and btree nodes for other inodes
414 */
415 isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
416 hasdir |= isadir;
417
418 if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
419 continue;
420
421 if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
422 args->dirs_only))
423 continue;
424
425 /*
426 * do some checks on the inode to see if we can prefetch
427 * its directory data. It's a cut down version of
428 * process_dinode_int() in dinode.c.
429 */
430 if (dino->di_format > XFS_DINODE_FMT_BTREE)
431 continue;
432
433 if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
434 continue;
435
436 if (!libxfs_dinode_good_version(mp, dino->di_version))
437 continue;
438
439 if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
440 continue;
441
442 if ((dino->di_forkoff != 0) &&
443 (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
444 continue;
445
446 switch (dino->di_format) {
447 case XFS_DINODE_FMT_EXTENTS:
448 pf_read_exinode(args, dino);
449 break;
450 case XFS_DINODE_FMT_BTREE:
451 pf_read_btinode(args, dino, isadir);
452 break;
453 }
454 }
455 if (hasdir)
456 XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
457 }
458
459 /*
460 * pf_batch_read must be called with the lock locked.
461 */
462 static void
463 pf_batch_read(
464 prefetch_args_t *args,
465 pf_which_t which,
466 void *buf)
467 {
468 xfs_buf_t *bplist[MAX_BUFS];
469 unsigned int num;
470 off64_t first_off, last_off, next_off;
471 int len, size;
472 int i;
473 int inode_bufs;
474 unsigned long fsbno = 0;
475 unsigned long max_fsbno;
476 char *pbuf;
477
478 for (;;) {
479 num = 0;
480 if (which == PF_SECONDARY) {
481 bplist[0] = btree_find(args->io_queue, 0, &fsbno);
482 max_fsbno = min(fsbno + pf_max_fsbs,
483 args->last_bno_read);
484 } else {
485 bplist[0] = btree_find(args->io_queue,
486 args->last_bno_read, &fsbno);
487 max_fsbno = fsbno + pf_max_fsbs;
488 }
489 while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
490 /*
491 * Discontiguous buffers need special handling, so stop
492 * gathering new buffers and process the list and this
493 * discontigous buffer immediately. This avoids the
494 * complexity of keeping a separate discontigous buffer
495 * list and seeking back over ranges we've already done
496 * optimised reads for.
497 */
498 if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
499 num++;
500 break;
501 }
502
503 if (which != PF_META_ONLY ||
504 !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
505 num++;
506 if (num == MAX_BUFS)
507 break;
508 bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
509 }
510 if (!num)
511 return;
512
513 /*
514 * do a big read if 25% of the potential buffer is useful,
515 * otherwise, find as many close together blocks and
516 * read them in one read
517 */
518 first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
519 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
520 XFS_BUF_SIZE(bplist[num-1]);
521 while (num > 1 && last_off - first_off > pf_max_bytes) {
522 num--;
523 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
524 XFS_BUF_SIZE(bplist[num-1]);
525 }
526 if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
527 /*
528 * not enough blocks for one big read, so determine
529 * the number of blocks that are close enough.
530 */
531 last_off = first_off + XFS_BUF_SIZE(bplist[0]);
532 for (i = 1; i < num; i++) {
533 next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
534 XFS_BUF_SIZE(bplist[i]);
535 if (next_off - last_off > pf_batch_bytes)
536 break;
537 last_off = next_off;
538 }
539 num = i;
540 }
541
542 for (i = 0; i < num; i++) {
543 if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
544 XFS_BUF_ADDR(bplist[i]))) == NULL)
545 do_error(_("prefetch corruption\n"));
546 }
547
548 if (which == PF_PRIMARY) {
549 for (inode_bufs = 0, i = 0; i < num; i++) {
550 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
551 inode_bufs++;
552 }
553 args->inode_bufs_queued -= inode_bufs;
554 if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
555 pf_batch_fsbs)
556 args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
557 }
558 #ifdef XR_PF_TRACE
559 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
560 (long long)XFS_BUF_ADDR(bplist[0]),
561 (long long)XFS_BUF_ADDR(bplist[num-1]), num,
562 (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
563 args->last_bno_read, args->inode_bufs_queued);
564 #endif
565 pthread_mutex_unlock(&args->lock);
566
567 /*
568 * now read the data and put into the xfs_but_t's
569 */
570 len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);
571
572 /*
573 * Check the last buffer on the list to see if we need to
574 * process a discontiguous buffer. The gather above loop
575 * guarantees that only the last buffer in the list will be a
576 * discontiguous buffer.
577 */
578 if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
579 libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
580 bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
581 libxfs_putbuf(bplist[num - 1]);
582 num--;
583 }
584
585 if (len > 0) {
586 /*
587 * go through the xfs_buf_t list copying from the
588 * read buffer into the xfs_buf_t's and release them.
589 */
590 for (i = 0; i < num; i++) {
591
592 pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
593 size = XFS_BUF_SIZE(bplist[i]);
594 if (len < size)
595 break;
596 memcpy(bplist[i]->b_addr, pbuf, size);
597 bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
598 LIBXFS_B_UNCHECKED);
599 len -= size;
600 if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
601 pf_read_inode_dirs(args, bplist[i]);
602 else if (which == PF_META_ONLY)
603 XFS_BUF_SET_PRIORITY(bplist[i],
604 B_DIR_META_H);
605 else if (which == PF_PRIMARY && num == 1)
606 XFS_BUF_SET_PRIORITY(bplist[i],
607 B_DIR_META_S);
608 }
609 }
610 for (i = 0; i < num; i++) {
611 pftrace("putbuf %c %p (%llu) in AG %d",
612 B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
613 bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
614 args->agno);
615 libxfs_putbuf(bplist[i]);
616 }
617 pthread_mutex_lock(&args->lock);
618 if (which != PF_SECONDARY) {
619 pftrace("inode_bufs_queued for AG %d = %d", args->agno,
620 args->inode_bufs_queued);
621 /*
622 * if primary inode queue running low, process metadata
623 * in boths queues to avoid I/O starvation as the
624 * processing thread would be waiting for a metadata
625 * buffer
626 */
627 if (which == PF_PRIMARY && !args->queuing_done &&
628 args->inode_bufs_queued < IO_THRESHOLD) {
629 pftrace("reading metadata bufs from primary queue for AG %d",
630 args->agno);
631
632 pf_batch_read(args, PF_META_ONLY, buf);
633
634 pftrace("reading bufs from secondary queue for AG %d",
635 args->agno);
636
637 pf_batch_read(args, PF_SECONDARY, buf);
638 }
639 }
640 }
641 }
642
643 static void *
644 pf_io_worker(
645 void *param)
646 {
647 prefetch_args_t *args = param;
648 void *buf = memalign(libxfs_device_alignment(),
649 pf_max_bytes);
650
651 if (buf == NULL)
652 return NULL;
653
654 pthread_mutex_lock(&args->lock);
655 while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
656 pftrace("waiting to start prefetch I/O for AG %d", args->agno);
657
658 while (!args->can_start_reading && !args->queuing_done)
659 pthread_cond_wait(&args->start_reading, &args->lock);
660
661 pftrace("starting prefetch I/O for AG %d", args->agno);
662
663 pf_batch_read(args, PF_PRIMARY, buf);
664 pf_batch_read(args, PF_SECONDARY, buf);
665
666 pftrace("ran out of bufs to prefetch for AG %d", args->agno);
667
668 if (!args->queuing_done)
669 args->can_start_reading = 0;
670 }
671 pthread_mutex_unlock(&args->lock);
672
673 free(buf);
674
675 pftrace("finished prefetch I/O for AG %d", args->agno);
676
677 return NULL;
678 }
679
680 static int
681 pf_create_prefetch_thread(
682 prefetch_args_t *args);
683
684 /*
685 * If we fail to create the queuing thread or can't create even one
686 * prefetch thread, we need to let processing continue without it.
687 */
688 static void
689 pf_skip_prefetch_thread(prefetch_args_t *args)
690 {
691 prefetch_args_t *next;
692
693 pthread_mutex_lock(&args->lock);
694 args->prefetch_done = 1;
695 pf_start_processing(args);
696 next = args->next_args;
697 args->next_args = NULL;
698 pthread_mutex_unlock(&args->lock);
699
700 if (next)
701 pf_create_prefetch_thread(next);
702 }
703
704 static void *
705 pf_queuing_worker(
706 void *param)
707 {
708 prefetch_args_t *args = param;
709 prefetch_args_t *next_args;
710 int num_inos;
711 ino_tree_node_t *irec;
712 ino_tree_node_t *cur_irec;
713 int blks_per_cluster;
714 xfs_agblock_t bno;
715 int i;
716 int err;
717 uint64_t sparse;
718
719 blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
720 if (blks_per_cluster == 0)
721 blks_per_cluster = 1;
722
723 for (i = 0; i < PF_THREAD_COUNT; i++) {
724 err = pthread_create(&args->io_threads[i], NULL,
725 pf_io_worker, args);
726 if (err != 0) {
727 do_warn(_("failed to create prefetch thread: %s\n"),
728 strerror(err));
729 pftrace("failed to create prefetch thread for AG %d: %s",
730 args->agno, strerror(err));
731 args->io_threads[i] = 0;
732 if (i == 0) {
733 pf_skip_prefetch_thread(args);
734 return NULL;
735 }
736 /*
737 * since we have at least one I/O thread, use them for
738 * prefetch
739 */
740 break;
741 }
742 }
743 pftrace("starting prefetch for AG %d", args->agno);
744
745 for (irec = findfirst_inode_rec(args->agno); irec != NULL;
746 irec = next_ino_rec(irec)) {
747
748 cur_irec = irec;
749
750 num_inos = XFS_INODES_PER_CHUNK;
751 while (num_inos < mp->m_ialloc_inos && irec != NULL) {
752 irec = next_ino_rec(irec);
753 num_inos += XFS_INODES_PER_CHUNK;
754 }
755
756 if (args->dirs_only && cur_irec->ino_isa_dir == 0)
757 continue;
758 #ifdef XR_PF_TRACE
759 sem_getvalue(&args->ra_count, &i);
760 pftrace("queuing irec %p in AG %d, sem count = %d",
761 irec, args->agno, i);
762 #endif
763 err = sem_trywait(&args->ra_count);
764 if (err < 0 && errno == EAGAIN) {
765 /*
766 * Kick the queue once we have reached the limit;
767 * without this the threads processing the inodes
768 * might get stuck on a buffer that has been locked
769 * and added to the I/O queue but is waiting for
770 * the thread to be woken.
771 */
772 pf_start_io_workers(args);
773 sem_wait(&args->ra_count);
774 }
775
776 num_inos = 0;
777 bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
778 sparse = cur_irec->ir_sparse;
779
780 do {
781 struct xfs_buf_map map;
782
783 map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
784 map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
785
786 /*
787 * Queue I/O for each non-sparse cluster. We can check
788 * sparse state in cluster sized chunks as cluster size
789 * is the min. granularity of sparse irec regions.
790 */
791 if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
792 pf_queue_io(args, &map, 1,
793 (cur_irec->ino_isa_dir != 0) ?
794 B_DIR_INODE : B_INODE);
795
796 bno += blks_per_cluster;
797 num_inos += inodes_per_cluster;
798 sparse >>= inodes_per_cluster;
799 } while (num_inos < mp->m_ialloc_inos);
800 }
801
802 pthread_mutex_lock(&args->lock);
803
804 pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
805 args->agno, args->inode_bufs_queued);
806
807 args->queuing_done = 1;
808 pf_start_io_workers(args);
809 pf_start_processing(args);
810 pthread_mutex_unlock(&args->lock);
811
812 /* now wait for the readers to finish */
813 for (i = 0; i < PF_THREAD_COUNT; i++)
814 if (args->io_threads[i])
815 pthread_join(args->io_threads[i], NULL);
816
817 pftrace("prefetch for AG %d finished", args->agno);
818
819 pthread_mutex_lock(&args->lock);
820
821 ASSERT(btree_is_empty(args->io_queue));
822
823 args->prefetch_done = 1;
824 next_args = args->next_args;
825 args->next_args = NULL;
826 pthread_mutex_unlock(&args->lock);
827
828 if (next_args)
829 pf_create_prefetch_thread(next_args);
830
831 return NULL;
832 }
833
834 static int
835 pf_create_prefetch_thread(
836 prefetch_args_t *args)
837 {
838 int err;
839
840 pftrace("creating queue thread for AG %d", args->agno);
841
842 err = pthread_create(&args->queuing_thread, NULL,
843 pf_queuing_worker, args);
844 if (err != 0) {
845 do_warn(_("failed to create prefetch thread: %s\n"),
846 strerror(err));
847 pftrace("failed to create prefetch thread for AG %d: %s",
848 args->agno, strerror(err));
849 args->queuing_thread = 0;
850 pf_skip_prefetch_thread(args);
851 }
852
853 return err == 0;
854 }
855
856 void
857 init_prefetch(
858 xfs_mount_t *pmp)
859 {
860 mp = pmp;
861 mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
862 pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
863 pf_max_bbs = pf_max_bytes >> BBSHIFT;
864 pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
865 pf_batch_bytes = DEF_BATCH_BYTES;
866 pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
867 }
868
869 prefetch_args_t *
870 start_inode_prefetch(
871 xfs_agnumber_t agno,
872 int dirs_only,
873 prefetch_args_t *prev_args)
874 {
875 prefetch_args_t *args;
876 long max_queue;
877
878 if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
879 return NULL;
880
881 args = calloc(1, sizeof(prefetch_args_t));
882
883 btree_init(&args->io_queue);
884 if (pthread_mutex_init(&args->lock, NULL) != 0)
885 do_error(_("failed to initialize prefetch mutex\n"));
886 if (pthread_cond_init(&args->start_reading, NULL) != 0)
887 do_error(_("failed to initialize prefetch cond var\n"));
888 if (pthread_cond_init(&args->start_processing, NULL) != 0)
889 do_error(_("failed to initialize prefetch cond var\n"));
890 args->agno = agno;
891 args->dirs_only = dirs_only;
892
893 /*
894 * use only 1/8 of the libxfs cache as we are only counting inodes
895 * and not any other associated metadata like directories
896 */
897
898 max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
899 if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
900 max_queue = max_queue *
901 (mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
902 mp->m_ialloc_blks;
903
904 sem_init(&args->ra_count, 0, max_queue);
905
906 if (!prev_args) {
907 if (!pf_create_prefetch_thread(args))
908 return NULL;
909 } else {
910 pthread_mutex_lock(&prev_args->lock);
911 if (prev_args->prefetch_done) {
912 pthread_mutex_unlock(&prev_args->lock);
913 if (!pf_create_prefetch_thread(args))
914 args = NULL;
915 } else {
916 prev_args->next_args = args;
917 pftrace("queued AG %d after AG %d",
918 args->agno, prev_args->agno);
919 pthread_mutex_unlock(&prev_args->lock);
920 }
921 }
922
923 return args;
924 }
925
926 /*
927 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
928 * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch
929 * or process @end_ag). The function starts prefetch on the first AG, then loops
930 * starting prefetch on the next AG and then blocks processing the current AG as
931 * the prefetch queue brings inodes into the processing queue.
932 *
933 * There is only one prefetch taking place at a time, so the prefetch on the
934 * next AG only starts once the current AG has been completely prefetched. Hence
935 * the prefetch of the next AG will start some time before the processing of the
936 * current AG finishes, ensuring that when we iterate an start processing the
937 * next AG there is already a significant queue of inodes to process.
938 *
939 * Prefetch is done this way to prevent it from running too far ahead of the
940 * processing. Allowing it to do so can cause cache thrashing, where new
941 * prefetch causes previously prefetched buffers to be reclaimed before the
942 * processing thread uses them. This results in reading all the inodes and
943 * metadata twice per phase and it greatly slows down the processing. Hence we
944 * have to carefully control how far ahead we prefetch...
945 */
946 static void
947 prefetch_ag_range(
948 struct workqueue *work,
949 xfs_agnumber_t start_ag,
950 xfs_agnumber_t end_ag,
951 bool dirs_only,
952 void (*func)(struct workqueue *,
953 xfs_agnumber_t, void *))
954 {
955 int i;
956 struct prefetch_args *pf_args[2];
957
958 pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
959 for (i = start_ag; i < end_ag; i++) {
960 /* Don't prefetch end_ag */
961 if (i + 1 < end_ag)
962 pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
963 dirs_only, pf_args[i & 1]);
964 func(work, i, pf_args[i & 1]);
965 }
966 }
967
968 struct pf_work_args {
969 xfs_agnumber_t start_ag;
970 xfs_agnumber_t end_ag;
971 bool dirs_only;
972 void (*func)(struct workqueue *, xfs_agnumber_t, void *);
973 };
974
975 static void
976 prefetch_ag_range_work(
977 struct workqueue *work,
978 xfs_agnumber_t unused,
979 void *args)
980 {
981 struct pf_work_args *wargs = args;
982
983 prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
984 wargs->dirs_only, wargs->func);
985 free(args);
986 }
987
988 /*
989 * Do inode prefetch in the most optimal way for the context under which repair
990 * has been run.
991 */
992 void
993 do_inode_prefetch(
994 struct xfs_mount *mp,
995 int stride,
996 void (*func)(struct workqueue *,
997 xfs_agnumber_t, void *),
998 bool check_cache,
999 bool dirs_only)
1000 {
1001 int i;
1002 struct workqueue queue;
1003 struct workqueue *queues;
1004 int queues_started = 0;
1005
1006 /*
1007 * If the previous phases of repair have not overflowed the buffer
1008 * cache, then we don't need to re-read any of the metadata in the
1009 * filesystem - it's all in the cache. In that case, run a thread per
1010 * CPU to maximise parallelism of the queue to be processed.
1011 */
1012 if (check_cache && !libxfs_bcache_overflowed()) {
1013 queue.wq_ctx = mp;
1014 create_work_queue(&queue, mp, libxfs_nproc());
1015 for (i = 0; i < mp->m_sb.sb_agcount; i++)
1016 queue_work(&queue, func, i, NULL);
1017 destroy_work_queue(&queue);
1018 return;
1019 }
1020
1021 /*
1022 * single threaded behaviour - single prefetch thread, processed
1023 * directly after each AG is queued.
1024 */
1025 if (!stride) {
1026 queue.wq_ctx = mp;
1027 prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
1028 dirs_only, func);
1029 return;
1030 }
1031
1032 /*
1033 * create one worker thread for each segment of the volume
1034 */
1035 queues = malloc(thread_count * sizeof(struct workqueue));
1036 for (i = 0; i < thread_count; i++) {
1037 struct pf_work_args *wargs;
1038
1039 wargs = malloc(sizeof(struct pf_work_args));
1040 wargs->start_ag = i * stride;
1041 wargs->end_ag = min((i + 1) * stride,
1042 mp->m_sb.sb_agcount);
1043 wargs->dirs_only = dirs_only;
1044 wargs->func = func;
1045
1046 create_work_queue(&queues[i], mp, 1);
1047 queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
1048 queues_started++;
1049
1050 if (wargs->end_ag >= mp->m_sb.sb_agcount)
1051 break;
1052 }
1053
1054 /*
1055 * wait for workers to complete
1056 */
1057 for (i = 0; i < queues_started; i++)
1058 destroy_work_queue(&queues[i]);
1059 free(queues);
1060 }
1061
1062 void
1063 wait_for_inode_prefetch(
1064 prefetch_args_t *args)
1065 {
1066 if (args == NULL)
1067 return;
1068
1069 pthread_mutex_lock(&args->lock);
1070
1071 while (!args->can_start_processing) {
1072 pftrace("waiting to start processing AG %d", args->agno);
1073
1074 pthread_cond_wait(&args->start_processing, &args->lock);
1075 }
1076 pftrace("can start processing AG %d", args->agno);
1077
1078 pthread_mutex_unlock(&args->lock);
1079 }
1080
1081 void
1082 cleanup_inode_prefetch(
1083 prefetch_args_t *args)
1084 {
1085 if (args == NULL)
1086 return;
1087
1088 pftrace("waiting AG %d prefetch to finish", args->agno);
1089
1090 if (args->queuing_thread)
1091 pthread_join(args->queuing_thread, NULL);
1092
1093 pftrace("AG %d prefetch done", args->agno);
1094
1095 ASSERT(args->next_args == NULL);
1096
1097 pthread_mutex_destroy(&args->lock);
1098 pthread_cond_destroy(&args->start_reading);
1099 pthread_cond_destroy(&args->start_processing);
1100 sem_destroy(&args->ra_count);
1101 btree_destroy(args->io_queue);
1102
1103 free(args);
1104 }
1105
#ifdef XR_PF_TRACE

/* Trace output stream; NULL when tracing is not (or no longer) available. */
static FILE	*pf_trace_file;

/*
 * Open the prefetch trace file in line-buffered mode.  Tracing is a debug
 * aid, so failure to open the file just leaves tracing switched off.
 */
void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	if (pf_trace_file == NULL)
		return;
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

/* Close the trace file if it was opened. */
void
pftrace_done(void)
{
	if (pf_trace_file == NULL)
		return;
	fclose(pf_trace_file);
	pf_trace_file = NULL;
}

/*
 * Write one timestamped trace line: "<sec>.<usec> <func>: <message>".
 * @msg is a printf-style format; the formatted message is truncated to
 * fit a 200 byte buffer.  Does nothing when no trace file is open.
 */
void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	if (pf_trace_file == NULL)
		return;

	gettimeofday(&tv, NULL);

	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	buf[sizeof(buf)-1] = '\0';
	va_end(args);

	/* tv_sec/tv_usec are not guaranteed to be unsigned long: cast */
	fprintf(pf_trace_file, "%lu.%06lu %s: %s\n",
		(unsigned long)tv.tv_sec, (unsigned long)tv.tv_usec,
		func, buf);
}

#endif