]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - repair/prefetch.c
Don't destroy file with -N -d file options
[thirdparty/xfsprogs-dev.git] / repair / prefetch.c
CommitLineData
cb5b3ef4 1#include <libxfs.h>
2556c98b 2#include <pthread.h>
cb5b3ef4
MV
3#include "avl.h"
4#include "globals.h"
5#include "agheader.h"
6#include "incore.h"
7#include "dir.h"
8#include "dir2.h"
cb5b3ef4
MV
9#include "protos.h"
10#include "err_protos.h"
11#include "dinode.h"
12#include "bmap.h"
13#include "versions.h"
2556c98b
BN
14#include "threads.h"
15#include "prefetch.h"
16#include "progress.h"
17#include "radix-tree.h"
cb5b3ef4
MV
18
/* Global switch: non-zero enables the prefetch machinery (default on). */
int do_prefetch = 1;

/*
 * Performs prefetching by priming the libxfs cache by using a dedicate thread
 * scanning inodes and reading blocks in ahead of time they are required.
 *
 * Any I/O errors can be safely ignored.
 */

/* File-scope state shared by all prefetch threads; set in init_prefetch(). */
static xfs_mount_t	*mp;		/* mount being repaired */
static int		mp_fd;		/* raw fd of mp->m_dev, for pread64() */
static int		pf_max_bytes;	/* max bytes per batched read */
static int		pf_max_bbs;	/* pf_max_bytes in basic blocks */
static int		pf_max_fsbs;	/* pf_max_bytes in filesystem blocks */
static int		pf_batch_bytes;	/* gap limit when coalescing reads */
static int		pf_batch_fsbs;	/* batch size in filesystem blocks */

static void		pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);

/*
 * buffer priorities for the libxfs cache
 *
 * Note the parity convention used by B_IS_INODE()/B_IS_META() below:
 * inode buffers get even priorities, metadata buffers get odd ones.
 */

#define B_DIR_BMAP	15
#define B_DIR_META_2	13	/* metadata in secondary queue */
#define B_DIR_META_H	11	/* metadata fetched for PF_META_ONLY */
#define B_DIR_META_S	9	/* single block of metadata */
#define B_DIR_META	7
#define B_DIR_INODE	6
#define B_BMAP		5
#define B_INODE		4

/* classify a priority value by its low bit (see table above) */
#define B_IS_INODE(b)	(((b) & 1) == 0)
#define B_IS_META(b)	(((b) & 1) != 0)

/* default coalescing window for close-together reads (64 KiB) */
#define DEF_BATCH_BYTES	0x10000

/* maximum buffers fetched from a queue per batch */
#define MAX_BUFS	128

/* queued inode buffers at which the I/O workers are kicked off */
#define IO_THRESHOLD	(MAX_BUFS * 2)

/* which queue pf_batch_read() should drain, and in what mode */
typedef enum pf_which {
	PF_PRIMARY,
	PF_SECONDARY,
	PF_META_ONLY
} pf_which_t;
63
64
65static inline void
66pf_start_processing(
67 prefetch_args_t *args)
68{
69 if (!args->can_start_processing) {
70#ifdef XR_PF_TRACE
71 pftrace("signalling processing for AG %d", args->agno);
72#endif
73 args->can_start_processing = 1;
74 pthread_cond_signal(&args->start_processing);
cb5b3ef4 75 }
2556c98b
BN
76}
77
78static inline void
79pf_start_io_workers(
80 prefetch_args_t *args)
81{
82 if (!args->can_start_reading) {
83#ifdef XR_PF_TRACE
84 pftrace("signalling reading for AG %d", args->agno);
85#endif
86 args->can_start_reading = 1;
87 pthread_cond_broadcast(&args->start_reading);
cb5b3ef4 88 }
cb5b3ef4
MV
89}
90
2556c98b 91
/*
 * Queue one extent (fsbno, blen blocks) for prefetch I/O with the given
 * cache priority in 'flag' (one of the B_* values; its low bit encodes
 * inode vs. metadata, see B_IS_INODE/B_IS_META).
 *
 * If the buffer is already up to date in the libxfs cache, no I/O is
 * queued: inode buffers get their directory contents scanned right away
 * and the buffer's priority is bumped so it stays cached.  Otherwise the
 * buffer is parked, still locked, in the primary queue (ahead of the
 * reader's current position) or the secondary queue (behind it).
 */
static void
pf_queue_io(
	prefetch_args_t		*args,
	xfs_fsblock_t		fsbno,
	int			blen,
	int			flag)
{
	xfs_buf_t		*bp;

	bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno),
			XFS_FSB_TO_BB(mp, blen));
	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		/* already cached: scan inodes now, raise priority, done */
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + 8);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	if (fsbno > args->last_bno_read) {
		radix_tree_insert(&args->primary_io_queue, fsbno, bp);
		if (B_IS_META(flag))
			/* tag 0 marks metadata for PF_META_ONLY lookups */
			radix_tree_tag_set(&args->primary_io_queue, fsbno, 0);
		else {
			args->inode_bufs_queued++;
			/* enough inode buffers queued: start the readers */
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
#ifdef XR_PF_TRACE
		pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
			"primary queue (inode_bufs_queued = %d, last_bno = %lu)",
			B_IS_INODE(flag) ? 'I' : 'M', bp,
			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
			args->inode_bufs_queued, args->last_bno_read);
#endif
	} else {
		/* behind the reader: only metadata may land here */
#ifdef XR_PF_TRACE
		pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to "
			"secondary queue (last_bno = %lu)",
			B_IS_INODE(flag) ? 'I' : 'M', bp,
			(long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
			args->last_bno_read);
#endif
		ASSERT(B_IS_META(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
		radix_tree_insert(&args->secondary_io_queue, fsbno, bp);
	}

	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}
147
2556c98b
BN
/*
 * Walk a list of on-disk bmap records and queue each directory extent for
 * prefetch, in directory-block-sized chunks.
 *
 * Returns 0 as soon as a record fails a sanity check (overlapping or
 * zero-length extent, offset past the fs size limit, block numbers outside
 * the filesystem); returns 1 if the whole list was processed.  Extents at
 * or beyond m_dirfreeblk are skipped unless dirs_only is set.
 */
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_dfsbno_t		s;	/* start */
	xfs_dfilblks_t		c;	/* count */
	xfs_dfiloff_t		o;	/* offset */
	xfs_dfilblks_t		cp = 0;	/* prev count */
	xfs_dfiloff_t		op = 0;	/* prev offset */
	int			flag;	/* extent flag */

	for (i = 0; i < numrecs; i++, rp++) {
		convert_extent((xfs_bmbt_rec_32_t*)rp, &o, &s, &c, &flag);

		/* records must be ascending, non-empty and in range */
		if (((i > 0) && (op + cp > o)) || (c == 0) ||
				(o >= fs_max_file_offset))
			return 0;

		if (!verify_dfsbno(mp, s) || !verify_dfsbno(mp, s + c - 1))
			return 0;

		if (!args->dirs_only && ((o + c) >= mp->m_dirfreeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = o;
		cp = c;

		/* queue the extent one directory block at a time */
		while (c) {
			unsigned int	len;
#ifdef XR_PF_TRACE
			pftrace("queuing dir extent in AG %d", args->agno);
#endif
			len = (c > mp->m_dirblkfsbs) ? mp->m_dirblkfsbs : c;
			pf_queue_io(args, s, len, B_DIR_META);
			c -= len;
			s += len;
		}
	}
	return 1;
}
cb5b3ef4 191
2556c98b
BN
192/*
193 * simplified version of the main scan_lbtree. Returns 0 to stop.
194 */
195
196static int
197pf_scan_lbtree(
198 xfs_dfsbno_t dbno,
199 int level,
200 int isadir,
201 prefetch_args_t *args,
202 int (*func)(xfs_btree_lblock_t *block,
203 int level,
204 int isadir,
205 prefetch_args_t *args))
206{
207 xfs_buf_t *bp;
208 int rc;
cb5b3ef4 209
2556c98b 210 bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
cb5b3ef4 211 XFS_FSB_TO_BB(mp, 1), 0);
2556c98b
BN
212 if (!bp)
213 return 0;
cb5b3ef4 214
69ec88b5
BN
215 XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);
216
2556c98b 217 rc = (*func)((xfs_btree_lblock_t *)XFS_BUF_PTR(bp), level - 1, isadir, args);
cb5b3ef4 218
2556c98b 219 libxfs_putbuf(bp);
cb5b3ef4 220
2556c98b
BN
221 return rc;
222}
223
/*
 * Callback for pf_scan_lbtree(): validate one bmap btree block and either
 * queue its directory extents (leaf level, directories only) or recurse
 * into each child block.  Returns 0 to stop the walk, 1 to continue.
 */
static int
pf_scanfunc_bmap(
	xfs_btree_lblock_t	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_rec_t		*rp;
	xfs_bmbt_ptr_t		*pp;
	int			numrecs;
	int			i;
	xfs_dfsbno_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		/* leaf: only directory extent lists are prefetched */
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;

		rp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt,
				block, 1, mp->m_bmap_dmxr[0]);

		return pf_read_bmbt_reclist(args, rp, numrecs);
	}

	/* interior node: recurse into each verified child pointer */
	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 1,
			mp->m_bmap_dmxr[1]);

	for (i = 0; i < numrecs; i++) {
		dbno = be64_to_cpu(pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}
271
2556c98b
BN
272
/*
 * Prefetch the bmap btree rooted in an inode's data fork: validate the
 * root (bmdr) block held inline in the fork, then descend into each child
 * via pf_scan_lbtree().  Stops silently on the first bad pointer or
 * failed scan.
 */
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_dfsbno_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	/* a root must have records and at least one level below it */
	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BTREE_PTR_ADDR(dsize, xfs_bmdr, dib, 1,
			XFS_BTREE_BLOCK_MAXRECS(dsize, xfs_bmdr, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = be64_to_cpu(pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}
313
314static void
315pf_read_exinode(
316 prefetch_args_t *args,
317 xfs_dinode_t *dino)
318{
319 pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
320 be32_to_cpu(dino->di_core.di_nextents));
321}
cb5b3ef4 322
2556c98b
BN
/*
 * Scan a freshly-read inode cluster buffer and queue the directory data
 * (extent lists or btree roots) of every plausible directory inode in it.
 * Non-directory inodes only contribute when they are btree-format (their
 * bmap nodes are still useful metadata).  If the cluster contained any
 * directory, the buffer's cache priority is raised to B_DIR_INODE.
 */
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;
	xfs_dinode_core_t	*dinoc;

	/* one iteration per inode slot in the cluster buffer */
	for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
		dino = XFS_MAKE_IPTR(mp, bp, icnt);
		dinoc = &dino->di_core;

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dinoc->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		/* local/inline format has no external blocks to fetch */
		if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dinoc->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dinoc->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dinoc->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!XFS_DINODE_GOOD_VERSION(dinoc->di_version) ||
				(!fs_inode_nlink && dinoc->di_version >
					XFS_DINODE_VERSION_1))
			continue;

		/* data that fits inside the fork needs no prefetch */
		if (be64_to_cpu(dinoc->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dinoc->di_forkoff != 0) &&
				(dinoc->di_forkoff >= (XFS_LITINO(mp) >> 3)))
			continue;

		switch (dinoc->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}
387
2556c98b
BN
/*
 * pf_batch_read must be called with the lock locked.
 *
 * Drain queued buffers from the selected queue in large coalesced reads:
 * pick a run of nearby buffers, delete them from the queue, drop the lock,
 * read the whole span with one pread64() into 'buf', copy the data into
 * the individual xfs_buf_t's, mark them up to date and release them, then
 * retake the lock.  In PF_PRIMARY mode it may recurse into PF_META_ONLY
 * and PF_SECONDARY when the inode queue runs low.  The lock is held on
 * entry and on exit, but is released around the actual I/O.
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	struct radix_tree_root	*queue;
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno;
	char			*pbuf;

	queue = (which != PF_SECONDARY) ? &args->primary_io_queue
				: &args->secondary_io_queue;

	while (radix_tree_lookup_first(queue, &fsbno) != NULL) {

		if (which != PF_META_ONLY) {
			/* grab any run of buffers within the max read span */
			num = radix_tree_gang_lookup_ex(queue,
					(void**)&bplist[0], fsbno,
					fsbno + pf_max_fsbs, MAX_BUFS);
			ASSERT(num > 0);
			ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) ==
				XFS_BUF_ADDR(bplist[0]));
		} else {
			/* metadata only: use the tag set by pf_queue_io() */
			num = radix_tree_gang_lookup_tag(queue,
					(void**)&bplist[0], fsbno,
					MAX_BUFS / 4, 0);
			if (num == 0)
				return;
		}

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks and
		 * read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		while (last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		/* remove the chosen buffers from the queue before dropping the lock */
		for (i = 0; i < num; i++) {
			if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			/* account for the inode buffers leaving the queue */
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
		}
#ifdef XR_PF_TRACE
		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);
#endif
		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_but_t's
		 */
		len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off);
		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			last_off = first_off;
			for (i = 0; i < num; i++) {

				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;	/* short read: rest stays stale */
				memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
				bplist[i]->b_flags |= LIBXFS_B_UPTODATE;
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_S);
			}
		}
		/* release every buffer, filled or not */
		for (i = 0; i < num; i++) {
#ifdef XR_PF_TRACE
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
#endif
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
#ifdef XR_PF_TRACE
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);
#endif
			/*
			 * if primary inode queue running low, process metadata
			 * in boths queues to avoid I/O starvation as the
			 * processing thread would be waiting for a metadata
			 * buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
#ifdef XR_PF_TRACE
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);
#endif
				pf_batch_read(args, PF_META_ONLY, buf);
#ifdef XR_PF_TRACE
				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);
#endif
				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}
548
/*
 * Prefetch I/O thread body (PF_THREAD_COUNT of these run per AG).
 * Each worker owns one aligned pf_max_bytes staging buffer and loops
 * draining the primary then the secondary queue until queuing is done
 * and the primary queue is empty.  Between rounds it sleeps on
 * start_reading until pf_start_io_workers() wakes it.
 * Returns NULL; a failed memalign() silently disables this worker.
 */
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t	*args = param;
	void		*buf = memalign(libxfs_device_alignment(),
					pf_max_bytes);

	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || args->primary_io_queue.height) {

#ifdef XR_PF_TRACE
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);
#endif
		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);
#ifdef XR_PF_TRACE
		pftrace("starting prefetch I/O for AG %d", args->agno);
#endif
		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

#ifdef XR_PF_TRACE
		pftrace("ran out of bufs to prefetch for AG %d", args->agno);
#endif
		/* rearm the gate so we wait for the next batch to build up */
		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

#ifdef XR_PF_TRACE
	pftrace("finished prefetch I/O for AG %d", args->agno);
#endif
	return NULL;
}
589
2556c98b
BN
static int
pf_create_prefetch_thread(
	prefetch_args_t		*args);

/*
 * Queuing thread body for one AG: spawn the I/O workers, walk the AG's
 * inode records queuing every inode cluster (directories first class,
 * everything else only when !dirs_only), then signal completion, join the
 * workers, and finally kick off the prefetch thread for the next AG that
 * start_inode_prefetch() chained onto next_args.
 * The ra_count semaphore throttles how many inode records are in flight;
 * it is posted elsewhere by the consuming side (not visible in this file).
 */
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;

	blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			if (i == 0) {
				/* no workers at all: unblock the repair thread */
				pf_start_processing(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}

#ifdef XR_PF_TRACE
	pftrace("starting prefetch for AG %d", args->agno);
#endif

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		/* advance past all records covered by one inode allocation */
		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < XFS_IALLOC_INODES(mp) && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		/* throttle: wait for a readahead slot */
		sem_wait(&args->ra_count);

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);

		/* queue every cluster backing this inode allocation */
		do {
			pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno),
					blks_per_cluster,
					(cur_irec->ino_isa_dir != 0) ?
						B_DIR_INODE : B_INODE);
			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
		} while (num_inos < XFS_IALLOC_INODES(mp));
	}

	pthread_mutex_lock(&args->lock);

#ifdef XR_PF_TRACE
	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);
#endif
	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

#ifdef XR_PF_TRACE
	pftrace("prefetch for AG %d finished", args->agno);
#endif
	pthread_mutex_lock(&args->lock);

	ASSERT(args->primary_io_queue.height == 0);
	ASSERT(args->secondary_io_queue.height == 0);

	args->prefetch_done = 1;
	if (args->next_args)
		pf_create_prefetch_thread(args->next_args);

	pthread_mutex_unlock(&args->lock);

	return NULL;
}
698
2556c98b
BN
699static int
700pf_create_prefetch_thread(
701 prefetch_args_t *args)
702{
703 int err;
704
705#ifdef XR_PF_TRACE
706 pftrace("creating queue thread for AG %d", args->agno);
707#endif
708 err = pthread_create(&args->queuing_thread, NULL,
709 pf_queuing_worker, args);
710 if (err != 0) {
711 do_warn(_("failed to create prefetch thread: %s\n"),
712 strerror(err));
713 cleanup_inode_prefetch(args);
714 }
715
716 return err == 0;
717}
cb5b3ef4
MV
718
719void
2556c98b
BN
720init_prefetch(
721 xfs_mount_t *pmp)
cb5b3ef4 722{
2556c98b
BN
723 mp = pmp;
724 mp_fd = libxfs_device_to_fd(mp->m_dev);
725 pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
726 pf_max_bbs = pf_max_bytes >> BBSHIFT;
727 pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
728 pf_batch_bytes = DEF_BATCH_BYTES;
729 pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
730}
cb5b3ef4 731
2556c98b
BN
732prefetch_args_t *
733start_inode_prefetch(
734 xfs_agnumber_t agno,
735 int dirs_only,
736 prefetch_args_t *prev_args)
737{
738 prefetch_args_t *args;
edf3f9d0 739 long max_queue;
cb5b3ef4 740
2556c98b
BN
741 if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
742 return NULL;
cb5b3ef4 743
2556c98b
BN
744 args = calloc(1, sizeof(prefetch_args_t));
745
746 INIT_RADIX_TREE(&args->primary_io_queue, 0);
747 INIT_RADIX_TREE(&args->secondary_io_queue, 0);
748 pthread_mutex_init(&args->lock, NULL);
749 pthread_cond_init(&args->start_reading, NULL);
750 pthread_cond_init(&args->start_processing, NULL);
751 args->agno = agno;
752 args->dirs_only = dirs_only;
753
754 /*
755 * use only 1/8 of the libxfs cache as we are only counting inodes
756 * and not any other associated metadata like directories
757 */
758
edf3f9d0
BN
759 max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
760 if (XFS_INODE_CLUSTER_SIZE(mp) > mp->m_sb.sb_blocksize)
761 max_queue = max_queue * (XFS_INODE_CLUSTER_SIZE(mp) >>
762 mp->m_sb.sb_blocklog) / XFS_IALLOC_BLOCKS(mp);
763
764 sem_init(&args->ra_count, 0, max_queue);
2556c98b
BN
765
766 if (!prev_args) {
767 if (!pf_create_prefetch_thread(args))
768 return NULL;
769 } else {
770 pthread_mutex_lock(&prev_args->lock);
771 if (prev_args->prefetch_done) {
772 if (!pf_create_prefetch_thread(args))
773 args = NULL;
774 } else
775 prev_args->next_args = args;
776 pthread_mutex_unlock(&prev_args->lock);
cb5b3ef4 777 }
2556c98b
BN
778
779 return args;
cb5b3ef4
MV
780}
781
782void
2556c98b
BN
783wait_for_inode_prefetch(
784 prefetch_args_t *args)
cb5b3ef4 785{
2556c98b 786 if (args == NULL)
cb5b3ef4 787 return;
2556c98b
BN
788
789 pthread_mutex_lock(&args->lock);
790
791 while (!args->can_start_processing) {
792#ifdef XR_PF_TRACE
793 pftrace("waiting to start processing AG %d", args->agno);
794#endif
795 pthread_cond_wait(&args->start_processing, &args->lock);
cb5b3ef4 796 }
2556c98b
BN
797#ifdef XR_PF_TRACE
798 pftrace("can start processing AG %d", args->agno);
799#endif
800 pthread_mutex_unlock(&args->lock);
801}
cb5b3ef4 802
2556c98b
BN
803void
804cleanup_inode_prefetch(
805 prefetch_args_t *args)
806{
807 if (args == NULL)
808 return;
cb5b3ef4 809
2556c98b
BN
810#ifdef XR_PF_TRACE
811 pftrace("waiting AG %d prefetch to finish", args->agno);
812#endif
813 if (args->queuing_thread)
814 pthread_join(args->queuing_thread, NULL);
815
816#ifdef XR_PF_TRACE
817 pftrace("AG %d prefetch done", args->agno);
818#endif
819 pthread_mutex_destroy(&args->lock);
820 pthread_cond_destroy(&args->start_reading);
821 pthread_cond_destroy(&args->start_processing);
822 sem_destroy(&args->ra_count);
823
824 free(args);
cb5b3ef4
MV
825}
826
2556c98b
BN
827#ifdef XR_PF_TRACE
828
cb5b3ef4 829void
2556c98b 830_pftrace(const char *func, const char *msg, ...)
cb5b3ef4 831{
2556c98b
BN
832 char buf[200];
833 struct timeval tv;
834 va_list args;
cb5b3ef4 835
2556c98b 836 gettimeofday(&tv, NULL);
cb5b3ef4 837
2556c98b
BN
838 va_start(args, msg);
839 vsnprintf(buf, sizeof(buf), msg, args);
840 buf[sizeof(buf)-1] = '\0';
841 va_end(args);
cb5b3ef4 842
2556c98b 843 fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, func, buf);
cb5b3ef4 844}
2556c98b
BN
845
846#endif