]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - repair/prefetch.c
Revert "Merge branch 'xfsprogs-dev'"
[thirdparty/xfsprogs-dev.git] / repair / prefetch.c
1 #include <libxfs.h>
2 #include <pthread.h>
3 #include "avl.h"
4 #include "globals.h"
5 #include "agheader.h"
6 #include "incore.h"
7 #include "dir.h"
8 #include "dir2.h"
9 #include "protos.h"
10 #include "err_protos.h"
11 #include "dinode.h"
12 #include "bmap.h"
13 #include "versions.h"
14 #include "threads.h"
15 #include "prefetch.h"
16 #include "progress.h"
17 #include "radix-tree.h"
18
int do_prefetch = 1;	/* global switch: when zero, start_inode_prefetch() returns NULL */
20
21 /*
22 * Performs prefetching by priming the libxfs cache by using a dedicate thread
23 * scanning inodes and reading blocks in ahead of time they are required.
24 *
25 * Any I/O errors can be safely ignored.
26 */
27
static xfs_mount_t	*mp;		/* mount being prefetched (set in init_prefetch) */
static int		mp_fd;		/* raw fd for mp->m_dev, used by pread64 */
static int		pf_max_bytes;	/* max span of one prefetch I/O (128 pages) */
static int		pf_max_bbs;	/* pf_max_bytes in 512-byte basic blocks */
static int		pf_max_fsbs;	/* pf_max_bytes in filesystem blocks */
static int		pf_batch_bytes;	/* max byte gap between "close together" blocks */
static int		pf_batch_fsbs;	/* fsb threshold before last_bno_read advances */
35
36 static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
37
38 /*
39 * Buffer priorities for the libxfs cache
40 *
41 * Directory metadata is ranked higher than other metadata as it's used
42 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
43 */
44
45 /* intermediate directory btree nodes - can't be queued */
46 #define B_DIR_BMAP CACHE_PREFETCH_PRIORITY + 7
47 /* directory metadata in secondary queue */
48 #define B_DIR_META_2 CACHE_PREFETCH_PRIORITY + 6
49 /* dir metadata that had to fetched from the primary queue to avoid stalling */
50 #define B_DIR_META_H CACHE_PREFETCH_PRIORITY + 5
51 /* single block of directory metadata (can't batch read) */
52 #define B_DIR_META_S CACHE_PREFETCH_PRIORITY + 4
53 /* dir metadata with more than one block fetched in a single I/O */
54 #define B_DIR_META CACHE_PREFETCH_PRIORITY + 3
55 /* inode clusters with directory inodes */
56 #define B_DIR_INODE CACHE_PREFETCH_PRIORITY + 2
57 /* intermediate extent btree nodes */
58 #define B_BMAP CACHE_PREFETCH_PRIORITY + 1
59 /* inode clusters without any directory entries */
60 #define B_INODE CACHE_PREFETCH_PRIORITY
61
62 /*
63 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
64 * the buffer is for an inode or other metadata.
65 */
66 #define B_IS_INODE(f) (((f) & 5) == 0)
67 #define B_IS_META(f) (((f) & 5) != 0)
68
69 #define DEF_BATCH_BYTES 0x10000
70
71 #define MAX_BUFS 128
72
73 #define IO_THRESHOLD (MAX_BUFS * 2)
74
75 typedef enum pf_which {
76 PF_PRIMARY,
77 PF_SECONDARY,
78 PF_META_ONLY
79 } pf_which_t;
80
81
82 static inline void
83 pf_start_processing(
84 prefetch_args_t *args)
85 {
86 if (!args->can_start_processing) {
87 #ifdef XR_PF_TRACE
88 pftrace("signalling processing for AG %d", args->agno);
89 #endif
90 args->can_start_processing = 1;
91 pthread_cond_signal(&args->start_processing);
92 }
93 }
94
95 static inline void
96 pf_start_io_workers(
97 prefetch_args_t *args)
98 {
99 if (!args->can_start_reading) {
100 #ifdef XR_PF_TRACE
101 pftrace("signalling reading for AG %d", args->agno);
102 #endif
103 args->can_start_reading = 1;
104 pthread_cond_broadcast(&args->start_reading);
105 }
106 }
107
108
/*
 * Queue one run of blocks for prefetch I/O.
 *
 * If the buffer is already up to date in the libxfs cache there is no
 * I/O to do: scan it for directory inodes when it is an inode cluster,
 * bump its cache priority and release it.  Otherwise tag it with the
 * given priority and insert it into the AG's primary queue (blocks
 * ahead of the readers' position) or, metadata only, into the
 * secondary queue (blocks the readers have already passed).
 *
 * Takes args->lock; wakes the I/O workers once IO_THRESHOLD inode
 * buffers are queued, and signals the processing thread.
 */
static void
pf_queue_io(
	prefetch_args_t		*args,
	xfs_fsblock_t		fsbno,
	int			blen,
	int			flag)
{
	xfs_buf_t		*bp;

	bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
			XFS_FSB_TO_BB(mp, blen));
	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		/* already cached: no read needed, just re-prioritize */
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
				CACHE_PREFETCH_PRIORITY);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	if (fsbno > args->last_bno_read) {
		radix_tree_insert(&args->primary_io_queue, fsbno, bp);
		if (B_IS_META(flag))
			/* tag 0 lets PF_META_ONLY lookups find metadata */
			radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
		else {
			args->inode_bufs_queued++;
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
#ifdef XR_PF_TRACE
		pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
			"primary queue (inode_bufs_queued = %d, last_bno = %lu)",
			B_IS_INODE(flag) ? 'I' : 'M', bp,
			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
			args->inode_bufs_queued, args->last_bno_read);
#endif
	} else {
#ifdef XR_PF_TRACE
		pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
			"secondary queue (last_bno = %lu)",
			B_IS_INODE(flag) ? 'I' : 'M', bp,
			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
			args->last_bno_read);
#endif
		/*
		 * Only metadata should land behind last_bno_read —
		 * presumably inode clusters are queued in ascending
		 * block order by pf_queuing_worker, hence the assert.
		 */
		ASSERT(B_IS_META(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
		radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
	}

	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}
165
/*
 * Queue prefetch I/O for every directory extent in an on-disk bmbt
 * record list.
 *
 * Each of the numrecs records at rp is validated as the walk goes:
 * offsets must not overlap the previous extent, counts must be
 * non-zero, offsets must lie below fs_max_file_offset, and the block
 * range must be a valid filesystem range.  Returns 0 on the first
 * invalid record (caller stops scanning this tree), 1 otherwise.
 */
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_dfilblks_t		cp = 0;	/* prev count */
	xfs_dfiloff_t		op = 0;	/* prev offset */

	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			return 0;

		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			return 0;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dirfreeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		/* queue the extent in directory-block-sized pieces */
		while (irec.br_blockcount) {
			unsigned int	len;
#ifdef XR_PF_TRACE
			pftrace("queuing dir extent in AG %d", args->agno);
#endif
			len = (irec.br_blockcount > mp->m_dirblkfsbs) ?
					mp->m_dirblkfsbs : irec.br_blockcount;
			pf_queue_io(args, irec.br_startblock, len, B_DIR_META);
			irec.br_blockcount -= len;
			irec.br_startblock += len;
		}
	}
	return 1;
}
210
211 /*
212 * simplified version of the main scan_lbtree. Returns 0 to stop.
213 */
214
215 static int
216 pf_scan_lbtree(
217 xfs_dfsbno_t dbno,
218 int level,
219 int isadir,
220 prefetch_args_t *args,
221 int (*func)(struct xfs_btree_block *block,
222 int level,
223 int isadir,
224 prefetch_args_t *args))
225 {
226 xfs_buf_t *bp;
227 int rc;
228
229 bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
230 XFS_FSB_TO_BB(mp, 1), 0);
231 if (!bp)
232 return 0;
233
234 XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
235
236 rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
237
238 libxfs_putbuf(bp);
239
240 return rc;
241 }
242
/*
 * Scan callback for pf_scan_lbtree: validate a bmap btree block, then
 * either queue the leaf extent records (level 0, directories only) or
 * recurse into every child block.  Returns 0 to stop the scan.
 */
static int
pf_scanfunc_bmap(
	struct xfs_btree_block	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_ptr_t		*pp;
	int 			numrecs;
	int			i;
	xfs_dfsbno_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		/* leaf block: only directory extents get prefetched */
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;
		return pf_read_bmbt_reclist(args,
			XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
	}

	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

	/* interior node: recurse one level down for each child */
	for (i = 0; i < numrecs; i++) {
		dbno = be64_to_cpu(pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}
285
286
/*
 * Prefetch the bmap btree rooted in an inode's data fork: validate the
 * incore root (bmdr) block, then scan each child subtree via
 * pf_scan_lbtree().  Stops quietly at the first invalid pointer or
 * failed scan — prefetch errors are ignorable.
 */
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_dfsbno_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	/* root must have records and be a plausible interior level */
	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BMDR_PTR_ADDR(dib, 1, xfs_bmdr_maxrecs(mp, dsize, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = be64_to_cpu(pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}
326
327 static void
328 pf_read_exinode(
329 prefetch_args_t *args,
330 xfs_dinode_t *dino)
331 {
332 pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
333 be32_to_cpu(dino->di_core.di_nextents));
334 }
335
/*
 * Scan an inode cluster buffer and queue prefetch I/O for the
 * directory data of every plausible directory inode in it.  Each
 * inode must pass a cut-down version of the process_dinode_int()
 * sanity checks before its on-disk fork data is trusted.  If the
 * cluster contains at least one directory, promote the buffer to the
 * B_DIR_INODE cache priority.
 */
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;
	xfs_dinode_core_t	*dinoc;

	/* one pass per inode in the cluster buffer */
	for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
		dino = XFS_MAKE_IPTR(mp, bp, icnt);
		dinoc = &dino->di_core;

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dinoc->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		/* local-format forks have no external blocks to read */
		if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dinoc->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dinoc->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dinoc->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!XFS_DINODE_GOOD_VERSION(dinoc->di_version) ||
				(!fs_inode_nlink && dinoc->di_version >
					XFS_DINODE_VERSION_1))
			continue;

		/* size must reach beyond the inline data-fork area */
		if (be64_to_cpu(dinoc->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dinoc->di_forkoff != 0) &&
				(dinoc->di_forkoff >= (XFS_LITINO(mp) >> 3)))
			continue;

		switch (dinoc->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}
400
/*
 * Read a batch of queued buffers in as few large I/Os as possible,
 * copy the data into the individual xfs_buf_t's, and mark them up to
 * date in the libxfs cache.
 *
 * which selects the source: PF_PRIMARY drains the primary queue,
 * PF_META_ONLY pulls only tagged (metadata) entries from the primary
 * queue, PF_SECONDARY drains the secondary queue.
 *
 * pf_batch_read must be called with the lock locked; it is dropped
 * around the pread64/putbuf work and re-taken before looping.
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	struct radix_tree_root	*queue;
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno;
	char			*pbuf;

	queue = (which != PF_SECONDARY) ? &args->primary_io_queue
				: &args->secondary_io_queue;

	while (radix_tree_lookup_first(queue, &fsbno) != NULL) {

		if (which != PF_META_ONLY) {
			/* grab a run of queued buffers within the max span */
			num = radix_tree_gang_lookup_ex(queue,
					(void**)&bplist[0], fsbno,
					fsbno + pf_max_fsbs, MAX_BUFS);
			ASSERT(num > 0);
			ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) ==
				XFS_BUF_ADDR(bplist[0]));
		} else {
			/* metadata only: tag 0 was set in pf_queue_io */
			num = radix_tree_gang_lookup_tag(queue,
					(void**)&bplist[0], fsbno,
					MAX_BUFS / 4, 0);
			if (num == 0)
				return;
		}

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks and
		 * read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		/* trim the tail until the span fits in one prefetch I/O */
		while (last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		/* dequeue everything we are about to read */
		for (i = 0; i < num; i++) {
			if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			/*
			 * account the dequeued inode buffers and advance
			 * the last-read watermark used by pf_queue_io
			 */
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
		}
#ifdef XR_PF_TRACE
		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);
#endif
		/* drop the lock for the actual I/O and cache work */
		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_buf_t's
		 */
		len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);
		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			last_off = first_off;
			for (i = 0; i < num; i++) {

				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;	/* short read: the rest stay !UPTODATE */
				memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
				bplist[i]->b_flags |= LIBXFS_B_UPTODATE;
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_S);
			}
		}
		/* release every buffer, whether or not data landed in it */
		for (i = 0; i < num; i++) {
#ifdef XR_PF_TRACE
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
#endif
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
#ifdef XR_PF_TRACE
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);
#endif
			/*
			 * if primary inode queue running low, process metadata
			 * in both queues to avoid I/O starvation as the
			 * processing thread would be waiting for a metadata
			 * buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
#ifdef XR_PF_TRACE
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);
#endif
				pf_batch_read(args, PF_META_ONLY, buf);
#ifdef XR_PF_TRACE
				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);
#endif
				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}
561
562 static void *
563 pf_io_worker(
564 void *param)
565 {
566 prefetch_args_t *args = param;
567 void *buf = memalign(libxfs_device_alignment(),
568 pf_max_bytes);
569
570 if (buf == NULL)
571 return NULL;
572
573 pthread_mutex_lock(&args->lock);
574 while (!args->queuing_done || args->primary_io_queue.height) {
575
576 #ifdef XR_PF_TRACE
577 pftrace("waiting to start prefetch I/O for AG %d", args->agno);
578 #endif
579 while (!args->can_start_reading && !args->queuing_done)
580 pthread_cond_wait(&args->start_reading, &args->lock);
581 #ifdef XR_PF_TRACE
582 pftrace("starting prefetch I/O for AG %d", args->agno);
583 #endif
584 pf_batch_read(args, PF_PRIMARY, buf);
585 pf_batch_read(args, PF_SECONDARY, buf);
586
587 #ifdef XR_PF_TRACE
588 pftrace("ran out of bufs to prefetch for AG %d", args->agno);
589 #endif
590 if (!args->queuing_done)
591 args->can_start_reading = 0;
592 }
593 pthread_mutex_unlock(&args->lock);
594
595 free(buf);
596
597 #ifdef XR_PF_TRACE
598 pftrace("finished prefetch I/O for AG %d", args->agno);
599 #endif
600 return NULL;
601 }
602
603 static int
604 pf_create_prefetch_thread(
605 prefetch_args_t *args);
606
/*
 * Queuing thread for one AG: spawns PF_THREAD_COUNT I/O workers, then
 * walks the incore inode record tree queuing every inode cluster
 * (directory clusters only when dirs_only is set).  When queuing is
 * complete it wakes the readers and processing thread, joins the
 * workers, and kicks off the next AG's prefetch if one was chained
 * via next_args.
 */
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;

	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			if (i == 0) {
				/* no readers at all: unblock the waiter
				 * in wait_for_inode_prefetch and bail */
				pf_start_processing(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}

#ifdef XR_PF_TRACE
	pftrace("starting prefetch for AG %d", args->agno);
#endif

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		/* collapse consecutive chunk records into one allocation
		 * group's worth of inodes per queue step */
		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < XFS_IALLOC_INODES(mp) && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		/*
		 * throttle queue depth to the cache share computed in
		 * start_inode_prefetch; NOTE(review): ra_count appears
		 * to be posted by the processing side — not visible in
		 * this file.
		 */
		sem_wait(&args->ra_count);

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);

		/* queue each inode cluster in the chunk run */
		do {
			pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno),
					blks_per_cluster,
					(cur_irec->ino_isa_dir != 0) ?
							B_DIR_INODE : B_INODE);
			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
		} while (num_inos < XFS_IALLOC_INODES(mp));
	}

	pthread_mutex_lock(&args->lock);

#ifdef XR_PF_TRACE
	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);
#endif
	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

#ifdef XR_PF_TRACE
	pftrace("prefetch for AG %d finished", args->agno);
#endif
	pthread_mutex_lock(&args->lock);

	ASSERT(args->primary_io_queue.height == 0);
	ASSERT(args->secondary_io_queue.height == 0);

	args->prefetch_done = 1;
	/* chain-start the next AG queued by start_inode_prefetch */
	if (args->next_args)
		pf_create_prefetch_thread(args->next_args);

	pthread_mutex_unlock(&args->lock);

	return NULL;
}
711
712 static int
713 pf_create_prefetch_thread(
714 prefetch_args_t *args)
715 {
716 int err;
717
718 #ifdef XR_PF_TRACE
719 pftrace("creating queue thread for AG %d", args->agno);
720 #endif
721 err = pthread_create(&args->queuing_thread, NULL,
722 pf_queuing_worker, args);
723 if (err != 0) {
724 do_warn(_("failed to create prefetch thread: %s\n"),
725 strerror(err));
726 cleanup_inode_prefetch(args);
727 }
728
729 return err == 0;
730 }
731
732 void
733 init_prefetch(
734 xfs_mount_t *pmp)
735 {
736 mp = pmp;
737 mp_fd = libxfs_device_to_fd(mp->m_dev);
738 pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
739 pf_max_bbs = pf_max_bytes >> BBSHIFT;
740 pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
741 pf_batch_bytes = DEF_BATCH_BYTES;
742 pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
743 }
744
745 prefetch_args_t *
746 start_inode_prefetch(
747 xfs_agnumber_t agno,
748 int dirs_only,
749 prefetch_args_t *prev_args)
750 {
751 prefetch_args_t *args;
752 long max_queue;
753
754 if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
755 return NULL;
756
757 args = calloc(1, sizeof(prefetch_args_t));
758
759 INIT_RADIX_TREE(&args->primary_io_queue, 0);
760 INIT_RADIX_TREE(&args->secondary_io_queue, 0);
761 if (pthread_mutex_init(&args->lock, NULL) != 0)
762 do_error(_("failed to initialize prefetch mutex\n"));
763 if (pthread_cond_init(&args->start_reading, NULL) != 0)
764 do_error(_("failed to initialize prefetch cond var\n"));
765 if (pthread_cond_init(&args->start_processing, NULL) != 0)
766 do_error(_("failed to initialize prefetch cond var\n"));
767 args->agno = agno;
768 args->dirs_only = dirs_only;
769
770 /*
771 * use only 1/8 of the libxfs cache as we are only counting inodes
772 * and not any other associated metadata like directories
773 */
774
775 max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
776 if (XFS_INODE_CLUSTER_SIZE(mp) > mp->m_sb.sb_blocksize)
777 max_queue = max_queue * (XFS_INODE_CLUSTER_SIZE(mp) >>
778 mp->m_sb.sb_blocklog) / XFS_IALLOC_BLOCKS(mp);
779
780 sem_init(&args->ra_count, 0, max_queue);
781
782 if (!prev_args) {
783 if (!pf_create_prefetch_thread(args))
784 return NULL;
785 } else {
786 pthread_mutex_lock(&prev_args->lock);
787 if (prev_args->prefetch_done) {
788 if (!pf_create_prefetch_thread(args))
789 args = NULL;
790 } else
791 prev_args->next_args = args;
792 pthread_mutex_unlock(&prev_args->lock);
793 }
794
795 return args;
796 }
797
798 void
799 wait_for_inode_prefetch(
800 prefetch_args_t *args)
801 {
802 if (args == NULL)
803 return;
804
805 pthread_mutex_lock(&args->lock);
806
807 while (!args->can_start_processing) {
808 #ifdef XR_PF_TRACE
809 pftrace("waiting to start processing AG %d", args->agno);
810 #endif
811 pthread_cond_wait(&args->start_processing, &args->lock);
812 }
813 #ifdef XR_PF_TRACE
814 pftrace("can start processing AG %d", args->agno);
815 #endif
816 pthread_mutex_unlock(&args->lock);
817 }
818
819 void
820 cleanup_inode_prefetch(
821 prefetch_args_t *args)
822 {
823 if (args == NULL)
824 return;
825
826 #ifdef XR_PF_TRACE
827 pftrace("waiting AG %d prefetch to finish", args->agno);
828 #endif
829 if (args->queuing_thread)
830 pthread_join(args->queuing_thread, NULL);
831
832 #ifdef XR_PF_TRACE
833 pftrace("AG %d prefetch done", args->agno);
834 #endif
835 pthread_mutex_destroy(&args->lock);
836 pthread_cond_destroy(&args->start_reading);
837 pthread_cond_destroy(&args->start_processing);
838 sem_destroy(&args->ra_count);
839
840 free(args);
841 }
842
843 #ifdef XR_PF_TRACE
844
845 void
846 _pftrace(const char *func, const char *msg, ...)
847 {
848 char buf[200];
849 struct timeval tv;
850 va_list args;
851
852 gettimeofday(&tv, NULL);
853
854 va_start(args, msg);
855 vsnprintf(buf, sizeof(buf), msg, args);
856 buf[sizeof(buf)-1] = '\0';
857 va_end(args);
858
859 fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, func, buf);
860 }
861
862 #endif