]>
Commit | Line | Data |
---|---|---|
cb5b3ef4 | 1 | #include <libxfs.h> |
2556c98b | 2 | #include <pthread.h> |
cb5b3ef4 MV |
3 | #include "avl.h" |
4 | #include "globals.h" | |
5 | #include "agheader.h" | |
6 | #include "incore.h" | |
7 | #include "dir.h" | |
8 | #include "dir2.h" | |
cb5b3ef4 MV |
9 | #include "protos.h" |
10 | #include "err_protos.h" | |
11 | #include "dinode.h" | |
12 | #include "bmap.h" | |
13 | #include "versions.h" | |
2556c98b BN |
14 | #include "threads.h" |
15 | #include "prefetch.h" | |
16 | #include "progress.h" | |
17 | #include "radix-tree.h" | |
cb5b3ef4 MV |
18 | |
19 | int do_prefetch = 1; | |
20 | ||
2556c98b BN |
21 | /* |
22 | * Performs prefetching by priming the libxfs cache, using a dedicated thread | |
23 | * that scans inodes and reads blocks in ahead of the time they are required. | |
24 | * | |
25 | * Any I/O errors can be safely ignored. | |
26 | */ | |
cb5b3ef4 | 27 | |
2556c98b BN |
/*
 * File-scope prefetch state, filled in once by init_prefetch().
 *
 * mp/mp_fd	- the mount being repaired and the fd of its device
 * pf_max_bytes	- page size << 7; size of each I/O worker's scratch buffer
 *		  and the upper bound on a single batched read
 * pf_max_bbs	- pf_max_bytes expressed in basic blocks
 * pf_max_fsbs	- pf_max_bytes expressed in filesystem blocks
 * pf_batch_bytes - DEF_BATCH_BYTES; how far apart the ends of consecutive
 *		  buffers may be and still share one read in pf_batch_read()
 * pf_batch_fsbs - DEF_BATCH_BYTES >> (blocklog + 1); threshold used when
 *		  pf_batch_read() advances args->last_bno_read
 */
static xfs_mount_t *mp;
static int mp_fd;
static int pf_max_bytes;
static int pf_max_bbs;
static int pf_max_fsbs;
static int pf_batch_bytes;
static int pf_batch_fsbs;

static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);

/*
 * buffer priorities for the libxfs cache
 *
 * The low bit doubles as a type tag (see B_IS_INODE/B_IS_META below): even
 * values are inode buffers, odd values are metadata buffers.  pf_queue_io()
 * bumps an already up-to-date buffer by 8, which keeps that parity.
 */

#define B_DIR_BMAP 15	/* bmap btree block of a directory (pf_scan_lbtree) */
#define B_DIR_META_2 13 /* metadata in secondary queue */
#define B_DIR_META_H 11 /* metadata fetched for PF_META_ONLY */
#define B_DIR_META_S 9 /* single block of metadata */
#define B_DIR_META 7	/* directory extent block (pf_read_bmbt_reclist) */
#define B_DIR_INODE 6	/* inode cluster containing a directory inode */
#define B_BMAP 5	/* bmap btree block of a non-directory inode */
#define B_INODE 4	/* inode cluster without directories */

#define B_IS_INODE(b) (((b) & 1) == 0)
#define B_IS_META(b) (((b) & 1) != 0)

/* default value of pf_batch_bytes (64KiB) */
#define DEF_BATCH_BYTES 0x10000

/* most buffers handled by one pass of pf_batch_read() */
#define MAX_BUFS 128

/*
 * number of queued inode buffers at which pf_queue_io() wakes the I/O
 * workers, and below which pf_batch_read() also drains metadata
 */
#define IO_THRESHOLD (MAX_BUFS * 2)

/* selects which buffers a pf_batch_read() call works on */
typedef enum pf_which {
	PF_PRIMARY,	/* everything in the primary queue */
	PF_SECONDARY,	/* everything in the secondary queue */
	PF_META_ONLY	/* only tagged (metadata) entries of the primary queue */
} pf_which_t;
63 | ||
64 | ||
65 | static inline void | |
66 | pf_start_processing( | |
67 | prefetch_args_t *args) | |
68 | { | |
69 | if (!args->can_start_processing) { | |
70 | #ifdef XR_PF_TRACE | |
71 | pftrace("signalling processing for AG %d", args->agno); | |
72 | #endif | |
73 | args->can_start_processing = 1; | |
74 | pthread_cond_signal(&args->start_processing); | |
cb5b3ef4 | 75 | } |
2556c98b BN |
76 | } |
77 | ||
78 | static inline void | |
79 | pf_start_io_workers( | |
80 | prefetch_args_t *args) | |
81 | { | |
82 | if (!args->can_start_reading) { | |
83 | #ifdef XR_PF_TRACE | |
84 | pftrace("signalling reading for AG %d", args->agno); | |
85 | #endif | |
86 | args->can_start_reading = 1; | |
87 | pthread_cond_broadcast(&args->start_reading); | |
cb5b3ef4 | 88 | } |
cb5b3ef4 MV |
89 | } |
90 | ||
2556c98b | 91 | |
cb5b3ef4 | 92 | static void |
2556c98b BN |
93 | pf_queue_io( |
94 | prefetch_args_t *args, | |
95 | xfs_fsblock_t fsbno, | |
96 | int blen, | |
97 | int flag) | |
cb5b3ef4 | 98 | { |
2556c98b | 99 | xfs_buf_t *bp; |
cb5b3ef4 | 100 | |
2556c98b BN |
101 | bp = libxfs_getbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, fsbno), |
102 | XFS_FSB_TO_BB(mp, blen)); | |
103 | if (bp->b_flags & LIBXFS_B_UPTODATE) { | |
69ec88b5 BN |
104 | if (B_IS_INODE(flag)) |
105 | pf_read_inode_dirs(args, bp); | |
106 | XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + 8); | |
2556c98b | 107 | libxfs_putbuf(bp); |
cb5b3ef4 MV |
108 | return; |
109 | } | |
69ec88b5 | 110 | XFS_BUF_SET_PRIORITY(bp, flag); |
cb5b3ef4 | 111 | |
2556c98b | 112 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 113 | |
2556c98b BN |
114 | if (fsbno > args->last_bno_read) { |
115 | radix_tree_insert(&args->primary_io_queue, fsbno, bp); | |
69ec88b5 | 116 | if (B_IS_META(flag)) |
2556c98b BN |
117 | radix_tree_tag_set(&args->primary_io_queue, fsbno, 0); |
118 | else { | |
119 | args->inode_bufs_queued++; | |
120 | if (args->inode_bufs_queued == IO_THRESHOLD) | |
121 | pf_start_io_workers(args); | |
cb5b3ef4 | 122 | } |
2556c98b BN |
123 | #ifdef XR_PF_TRACE |
124 | pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " | |
125 | "primary queue (inode_bufs_queued = %d, last_bno = %lu)", | |
69ec88b5 | 126 | B_IS_INODE(flag) ? 'I' : 'M', bp, |
2556c98b BN |
127 | (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, |
128 | args->inode_bufs_queued, args->last_bno_read); | |
129 | #endif | |
130 | } else { | |
131 | #ifdef XR_PF_TRACE | |
132 | pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to " | |
133 | "secondary queue (last_bno = %lu)", | |
69ec88b5 | 134 | B_IS_INODE(flag) ? 'I' : 'M', bp, |
2556c98b BN |
135 | (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, |
136 | args->last_bno_read); | |
137 | #endif | |
69ec88b5 BN |
138 | ASSERT(B_IS_META(flag)); |
139 | XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2); | |
2556c98b | 140 | radix_tree_insert(&args->secondary_io_queue, fsbno, bp); |
cb5b3ef4 MV |
141 | } |
142 | ||
2556c98b | 143 | pf_start_processing(args); |
cb5b3ef4 | 144 | |
2556c98b | 145 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 MV |
146 | } |
147 | ||
2556c98b BN |
148 | static int |
149 | pf_read_bmbt_reclist( | |
150 | prefetch_args_t *args, | |
151 | xfs_bmbt_rec_t *rp, | |
152 | int numrecs) | |
cb5b3ef4 | 153 | { |
cb5b3ef4 | 154 | int i; |
2556c98b BN |
155 | xfs_dfsbno_t s; /* start */ |
156 | xfs_dfilblks_t c; /* count */ | |
157 | xfs_dfiloff_t o; /* offset */ | |
158 | xfs_dfilblks_t cp = 0; /* prev count */ | |
159 | xfs_dfiloff_t op = 0; /* prev offset */ | |
160 | int flag; /* extent flag */ | |
161 | ||
162 | for (i = 0; i < numrecs; i++, rp++) { | |
163 | convert_extent((xfs_bmbt_rec_32_t*)rp, &o, &s, &c, &flag); | |
164 | ||
165 | if (((i > 0) && (op + cp > o)) || (c == 0) || | |
166 | (o >= fs_max_file_offset)) | |
167 | return 0; | |
168 | ||
169 | if (!verify_dfsbno(mp, s) || !verify_dfsbno(mp, s + c - 1)) | |
170 | return 0; | |
171 | ||
172 | if (!args->dirs_only && ((o + c) >= mp->m_dirfreeblk)) | |
173 | break; /* only Phase 6 reads the free blocks */ | |
174 | ||
175 | op = o; | |
176 | cp = c; | |
177 | ||
178 | while (c) { | |
f8bc5a6f | 179 | unsigned int len; |
2556c98b BN |
180 | #ifdef XR_PF_TRACE |
181 | pftrace("queuing dir extent in AG %d", args->agno); | |
182 | #endif | |
f8bc5a6f BN |
183 | len = (c > mp->m_dirblkfsbs) ? mp->m_dirblkfsbs : c; |
184 | pf_queue_io(args, s, len, B_DIR_META); | |
185 | c -= len; | |
186 | s += len; | |
2556c98b BN |
187 | } |
188 | } | |
189 | return 1; | |
190 | } | |
cb5b3ef4 | 191 | |
2556c98b BN |
192 | /* |
193 | * simplified version of the main scan_lbtree. Returns 0 to stop. | |
194 | */ | |
195 | ||
196 | static int | |
197 | pf_scan_lbtree( | |
198 | xfs_dfsbno_t dbno, | |
199 | int level, | |
200 | int isadir, | |
201 | prefetch_args_t *args, | |
202 | int (*func)(xfs_btree_lblock_t *block, | |
203 | int level, | |
204 | int isadir, | |
205 | prefetch_args_t *args)) | |
206 | { | |
207 | xfs_buf_t *bp; | |
208 | int rc; | |
cb5b3ef4 | 209 | |
2556c98b | 210 | bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno), |
cb5b3ef4 | 211 | XFS_FSB_TO_BB(mp, 1), 0); |
2556c98b BN |
212 | if (!bp) |
213 | return 0; | |
cb5b3ef4 | 214 | |
69ec88b5 BN |
215 | XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP); |
216 | ||
2556c98b | 217 | rc = (*func)((xfs_btree_lblock_t *)XFS_BUF_PTR(bp), level - 1, isadir, args); |
cb5b3ef4 | 218 | |
2556c98b | 219 | libxfs_putbuf(bp); |
cb5b3ef4 | 220 | |
2556c98b BN |
221 | return rc; |
222 | } | |
223 | ||
224 | static int | |
225 | pf_scanfunc_bmap( | |
226 | xfs_btree_lblock_t *block, | |
227 | int level, | |
228 | int isadir, | |
229 | prefetch_args_t *args) | |
230 | { | |
231 | xfs_bmbt_rec_t *rp; | |
232 | xfs_bmbt_ptr_t *pp; | |
233 | int numrecs; | |
234 | int i; | |
235 | xfs_dfsbno_t dbno; | |
236 | ||
237 | /* | |
238 | * do some validation on the block contents | |
239 | */ | |
240 | if ((be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC) || | |
241 | (be16_to_cpu(block->bb_level) != level)) | |
242 | return 0; | |
243 | ||
244 | numrecs = be16_to_cpu(block->bb_numrecs); | |
245 | ||
246 | if (level == 0) { | |
247 | if (numrecs > mp->m_bmap_dmxr[0] || !isadir) | |
248 | return 0; | |
249 | ||
250 | rp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, | |
251 | block, 1, mp->m_bmap_dmxr[0]); | |
252 | ||
253 | return pf_read_bmbt_reclist(args, rp, numrecs); | |
cb5b3ef4 MV |
254 | } |
255 | ||
2556c98b BN |
256 | if (numrecs > mp->m_bmap_dmxr[1]) |
257 | return 0; | |
cb5b3ef4 | 258 | |
2556c98b BN |
259 | pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, 1, |
260 | mp->m_bmap_dmxr[1]); | |
261 | ||
262 | for (i = 0; i < numrecs; i++) { | |
263 | dbno = be64_to_cpu(pp[i]); | |
264 | if (!verify_dfsbno(mp, dbno)) | |
265 | return 0; | |
266 | if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap)) | |
267 | return 0; | |
cb5b3ef4 | 268 | } |
2556c98b | 269 | return 1; |
cb5b3ef4 MV |
270 | } |
271 | ||
2556c98b BN |
272 | |
273 | static void | |
274 | pf_read_btinode( | |
275 | prefetch_args_t *args, | |
276 | xfs_dinode_t *dino, | |
277 | int isadir) | |
cb5b3ef4 | 278 | { |
2556c98b BN |
279 | xfs_bmdr_block_t *dib; |
280 | xfs_bmbt_ptr_t *pp; | |
281 | int i; | |
282 | int level; | |
283 | int numrecs; | |
284 | int dsize; | |
285 | xfs_dfsbno_t dbno; | |
286 | ||
287 | dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino); | |
288 | ||
289 | level = be16_to_cpu(dib->bb_level); | |
290 | numrecs = be16_to_cpu(dib->bb_numrecs); | |
291 | ||
292 | if ((numrecs == 0) || (level == 0) || | |
293 | (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))) | |
cb5b3ef4 | 294 | return; |
2556c98b BN |
295 | /* |
296 | * use bmdr/dfork_dsize since the root block is in the data fork | |
297 | */ | |
298 | if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp)) | |
299 | return; | |
300 | ||
301 | dsize = XFS_DFORK_DSIZE(dino, mp); | |
302 | pp = XFS_BTREE_PTR_ADDR(dsize, xfs_bmdr, dib, 1, | |
303 | XFS_BTREE_BLOCK_MAXRECS(dsize, xfs_bmdr, 0)); | |
cb5b3ef4 | 304 | |
2556c98b BN |
305 | for (i = 0; i < numrecs; i++) { |
306 | dbno = be64_to_cpu(pp[i]); | |
307 | if (!verify_dfsbno(mp, dbno)) | |
cb5b3ef4 | 308 | break; |
2556c98b | 309 | if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap)) |
cb5b3ef4 | 310 | break; |
2556c98b BN |
311 | } |
312 | } | |
313 | ||
314 | static void | |
315 | pf_read_exinode( | |
316 | prefetch_args_t *args, | |
317 | xfs_dinode_t *dino) | |
318 | { | |
319 | pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino), | |
320 | be32_to_cpu(dino->di_core.di_nextents)); | |
321 | } | |
cb5b3ef4 | 322 | |
2556c98b BN |
323 | static void |
324 | pf_read_inode_dirs( | |
325 | prefetch_args_t *args, | |
326 | xfs_buf_t *bp) | |
327 | { | |
328 | xfs_dinode_t *dino; | |
329 | int icnt = 0; | |
69ec88b5 BN |
330 | int hasdir = 0; |
331 | int isadir; | |
2556c98b BN |
332 | xfs_dinode_core_t *dinoc; |
333 | ||
334 | for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) { | |
335 | dino = XFS_MAKE_IPTR(mp, bp, icnt); | |
336 | dinoc = &dino->di_core; | |
337 | ||
338 | /* | |
339 | * We are only prefetching directory contents in extents | |
340 | * and btree nodes for other inodes | |
341 | */ | |
69ec88b5 BN |
342 | isadir = (be16_to_cpu(dinoc->di_mode) & S_IFMT) == S_IFDIR; |
343 | hasdir |= isadir; | |
344 | ||
345 | if (dinoc->di_format <= XFS_DINODE_FMT_LOCAL) | |
346 | continue; | |
347 | ||
348 | if (!isadir && (dinoc->di_format == XFS_DINODE_FMT_EXTENTS || | |
349 | args->dirs_only)) | |
2556c98b BN |
350 | continue; |
351 | ||
352 | /* | |
353 | * do some checks on the inode to see if we can prefetch | |
354 | * its directory data. It's a cut down version of | |
355 | * process_dinode_int() in dinode.c. | |
356 | */ | |
357 | if (dinoc->di_format > XFS_DINODE_FMT_BTREE) | |
358 | continue; | |
359 | ||
360 | if (be16_to_cpu(dinoc->di_magic) != XFS_DINODE_MAGIC) | |
361 | continue; | |
362 | ||
363 | if (!XFS_DINODE_GOOD_VERSION(dinoc->di_version) || | |
364 | (!fs_inode_nlink && dinoc->di_version > | |
365 | XFS_DINODE_VERSION_1)) | |
366 | continue; | |
367 | ||
368 | if (be64_to_cpu(dinoc->di_size) <= XFS_DFORK_DSIZE(dino, mp)) | |
369 | continue; | |
370 | ||
371 | if ((dinoc->di_forkoff != 0) && | |
372 | (dinoc->di_forkoff >= (XFS_LITINO(mp) >> 3))) | |
373 | continue; | |
374 | ||
375 | switch (dinoc->di_format) { | |
376 | case XFS_DINODE_FMT_EXTENTS: | |
377 | pf_read_exinode(args, dino); | |
cb5b3ef4 | 378 | break; |
2556c98b | 379 | case XFS_DINODE_FMT_BTREE: |
69ec88b5 | 380 | pf_read_btinode(args, dino, isadir); |
cb5b3ef4 | 381 | break; |
cb5b3ef4 MV |
382 | } |
383 | } | |
69ec88b5 BN |
384 | if (hasdir) |
385 | XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE); | |
cb5b3ef4 MV |
386 | } |
387 | ||
2556c98b BN |
388 | /* |
389 | * pf_batch_read must be called with the lock locked. | |
390 | */ | |
391 | ||
cb5b3ef4 | 392 | static void |
2556c98b BN |
393 | pf_batch_read( |
394 | prefetch_args_t *args, | |
395 | pf_which_t which, | |
396 | void *buf) | |
cb5b3ef4 | 397 | { |
2556c98b BN |
398 | struct radix_tree_root *queue; |
399 | xfs_buf_t *bplist[MAX_BUFS]; | |
400 | unsigned int num; | |
401 | off64_t first_off, last_off, next_off; | |
402 | int len, size; | |
cb5b3ef4 | 403 | int i; |
2556c98b BN |
404 | int inode_bufs; |
405 | unsigned long fsbno; | |
406 | char *pbuf; | |
407 | ||
408 | queue = (which != PF_SECONDARY) ? &args->primary_io_queue | |
409 | : &args->secondary_io_queue; | |
410 | ||
411 | while (radix_tree_lookup_first(queue, &fsbno) != NULL) { | |
412 | ||
413 | if (which != PF_META_ONLY) { | |
414 | num = radix_tree_gang_lookup_ex(queue, | |
415 | (void**)&bplist[0], fsbno, | |
416 | fsbno + pf_max_fsbs, MAX_BUFS); | |
417 | ASSERT(num > 0); | |
418 | ASSERT(XFS_FSB_TO_DADDR(mp, fsbno) == | |
419 | XFS_BUF_ADDR(bplist[0])); | |
420 | } else { | |
421 | num = radix_tree_gang_lookup_tag(queue, | |
422 | (void**)&bplist[0], fsbno, | |
423 | MAX_BUFS / 4, 0); | |
424 | if (num == 0) | |
425 | return; | |
426 | } | |
cb5b3ef4 | 427 | |
2556c98b BN |
428 | /* |
429 | * do a big read if 25% of the potential buffer is useful, | |
430 | * otherwise, find as many close together blocks and | |
431 | * read them in one read | |
432 | */ | |
433 | first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0])); | |
434 | last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) + | |
435 | XFS_BUF_SIZE(bplist[num-1]); | |
436 | while (last_off - first_off > pf_max_bytes) { | |
437 | num--; | |
438 | last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) + | |
439 | XFS_BUF_SIZE(bplist[num-1]); | |
440 | } | |
441 | if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) { | |
442 | /* | |
443 | * not enough blocks for one big read, so determine | |
444 | * the number of blocks that are close enough. | |
445 | */ | |
446 | last_off = first_off + XFS_BUF_SIZE(bplist[0]); | |
447 | for (i = 1; i < num; i++) { | |
448 | next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) + | |
449 | XFS_BUF_SIZE(bplist[i]); | |
450 | if (next_off - last_off > pf_batch_bytes) | |
451 | break; | |
452 | last_off = next_off; | |
453 | } | |
454 | num = i; | |
455 | } | |
cb5b3ef4 | 456 | |
2556c98b BN |
457 | for (i = 0; i < num; i++) { |
458 | if (radix_tree_delete(queue, XFS_DADDR_TO_FSB(mp, | |
459 | XFS_BUF_ADDR(bplist[i]))) == NULL) | |
460 | do_error(_("prefetch corruption\n")); | |
cb5b3ef4 MV |
461 | } |
462 | ||
2556c98b BN |
463 | if (which == PF_PRIMARY) { |
464 | for (inode_bufs = 0, i = 0; i < num; i++) { | |
69ec88b5 | 465 | if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i]))) |
2556c98b BN |
466 | inode_bufs++; |
467 | } | |
468 | args->inode_bufs_queued -= inode_bufs; | |
469 | if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) > | |
470 | pf_batch_fsbs) | |
471 | args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog); | |
472 | } | |
473 | #ifdef XR_PF_TRACE | |
474 | pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)", | |
475 | (long long)XFS_BUF_ADDR(bplist[0]), | |
476 | (long long)XFS_BUF_ADDR(bplist[num-1]), num, | |
477 | (which != PF_SECONDARY) ? "pri" : "sec", args->agno, | |
478 | args->last_bno_read, args->inode_bufs_queued); | |
479 | #endif | |
480 | pthread_mutex_unlock(&args->lock); | |
481 | ||
482 | /* | |
483 | * now read the data and put into the xfs_but_t's | |
484 | */ | |
485 | len = pread64(mp_fd, buf, (int)(last_off - first_off), first_off); | |
486 | if (len > 0) { | |
487 | /* | |
488 | * go through the xfs_buf_t list copying from the | |
489 | * read buffer into the xfs_buf_t's and release them. | |
490 | */ | |
491 | last_off = first_off; | |
492 | for (i = 0; i < num; i++) { | |
493 | ||
494 | pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off); | |
495 | size = XFS_BUF_SIZE(bplist[i]); | |
496 | if (len < size) | |
497 | break; | |
498 | memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size); | |
499 | bplist[i]->b_flags |= LIBXFS_B_UPTODATE; | |
500 | len -= size; | |
69ec88b5 | 501 | if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i]))) |
2556c98b | 502 | pf_read_inode_dirs(args, bplist[i]); |
69ec88b5 BN |
503 | else if (which == PF_META_ONLY) |
504 | XFS_BUF_SET_PRIORITY(bplist[i], | |
505 | B_DIR_META_H); | |
506 | else if (which == PF_PRIMARY && num == 1) | |
507 | XFS_BUF_SET_PRIORITY(bplist[i], | |
508 | B_DIR_META_S); | |
2556c98b BN |
509 | } |
510 | } | |
511 | for (i = 0; i < num; i++) { | |
512 | #ifdef XR_PF_TRACE | |
513 | pftrace("putbuf %c %p (%llu) in AG %d", | |
69ec88b5 | 514 | B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M', |
2556c98b BN |
515 | bplist[i], (long long)XFS_BUF_ADDR(bplist[i]), |
516 | args->agno); | |
517 | #endif | |
518 | libxfs_putbuf(bplist[i]); | |
519 | } | |
520 | pthread_mutex_lock(&args->lock); | |
521 | if (which != PF_SECONDARY) { | |
522 | #ifdef XR_PF_TRACE | |
523 | pftrace("inode_bufs_queued for AG %d = %d", args->agno, | |
524 | args->inode_bufs_queued); | |
525 | #endif | |
526 | /* | |
527 | * if primary inode queue running low, process metadata | |
528 | * in boths queues to avoid I/O starvation as the | |
529 | * processing thread would be waiting for a metadata | |
530 | * buffer | |
531 | */ | |
532 | if (which == PF_PRIMARY && !args->queuing_done && | |
533 | args->inode_bufs_queued < IO_THRESHOLD) { | |
534 | #ifdef XR_PF_TRACE | |
535 | pftrace("reading metadata bufs from primary queue for AG %d", | |
536 | args->agno); | |
537 | #endif | |
538 | pf_batch_read(args, PF_META_ONLY, buf); | |
539 | #ifdef XR_PF_TRACE | |
540 | pftrace("reading bufs from secondary queue for AG %d", | |
541 | args->agno); | |
542 | #endif | |
543 | pf_batch_read(args, PF_SECONDARY, buf); | |
544 | } | |
cb5b3ef4 | 545 | } |
cb5b3ef4 | 546 | } |
2556c98b BN |
547 | } |
548 | ||
549 | static void * | |
550 | pf_io_worker( | |
551 | void *param) | |
552 | { | |
553 | prefetch_args_t *args = param; | |
554 | void *buf = memalign(libxfs_device_alignment(), | |
555 | pf_max_bytes); | |
556 | ||
557 | if (buf == NULL) | |
558 | return NULL; | |
cb5b3ef4 | 559 | |
2556c98b BN |
560 | pthread_mutex_lock(&args->lock); |
561 | while (!args->queuing_done || args->primary_io_queue.height) { | |
562 | ||
563 | #ifdef XR_PF_TRACE | |
564 | pftrace("waiting to start prefetch I/O for AG %d", args->agno); | |
565 | #endif | |
566 | while (!args->can_start_reading && !args->queuing_done) | |
567 | pthread_cond_wait(&args->start_reading, &args->lock); | |
568 | #ifdef XR_PF_TRACE | |
569 | pftrace("starting prefetch I/O for AG %d", args->agno); | |
570 | #endif | |
571 | pf_batch_read(args, PF_PRIMARY, buf); | |
572 | pf_batch_read(args, PF_SECONDARY, buf); | |
573 | ||
574 | #ifdef XR_PF_TRACE | |
575 | pftrace("ran out of bufs to prefetch for AG %d", args->agno); | |
576 | #endif | |
577 | if (!args->queuing_done) | |
578 | args->can_start_reading = 0; | |
cb5b3ef4 | 579 | } |
2556c98b | 580 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 | 581 | |
2556c98b BN |
582 | free(buf); |
583 | ||
584 | #ifdef XR_PF_TRACE | |
585 | pftrace("finished prefetch I/O for AG %d", args->agno); | |
586 | #endif | |
587 | return NULL; | |
cb5b3ef4 MV |
588 | } |
589 | ||
2556c98b BN |
590 | static int |
591 | pf_create_prefetch_thread( | |
592 | prefetch_args_t *args); | |
593 | ||
594 | static void * | |
595 | pf_queuing_worker( | |
596 | void *param) | |
cb5b3ef4 | 597 | { |
2556c98b BN |
598 | prefetch_args_t *args = param; |
599 | int num_inos; | |
600 | ino_tree_node_t *irec; | |
601 | ino_tree_node_t *cur_irec; | |
602 | int blks_per_cluster; | |
2556c98b | 603 | xfs_agblock_t bno; |
cb5b3ef4 | 604 | int i; |
2556c98b BN |
605 | int err; |
606 | ||
607 | blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; | |
608 | if (blks_per_cluster == 0) | |
609 | blks_per_cluster = 1; | |
2556c98b BN |
610 | |
611 | for (i = 0; i < PF_THREAD_COUNT; i++) { | |
612 | err = pthread_create(&args->io_threads[i], NULL, | |
613 | pf_io_worker, args); | |
614 | if (err != 0) { | |
615 | do_warn(_("failed to create prefetch thread: %s\n"), | |
616 | strerror(err)); | |
617 | if (i == 0) { | |
618 | pf_start_processing(args); | |
619 | return NULL; | |
620 | } | |
621 | /* | |
622 | * since we have at least one I/O thread, use them for | |
623 | * prefetch | |
624 | */ | |
625 | break; | |
626 | } | |
cb5b3ef4 MV |
627 | } |
628 | ||
2556c98b BN |
629 | #ifdef XR_PF_TRACE |
630 | pftrace("starting prefetch for AG %d", args->agno); | |
631 | #endif | |
cb5b3ef4 | 632 | |
2556c98b BN |
633 | for (irec = findfirst_inode_rec(args->agno); irec != NULL; |
634 | irec = next_ino_rec(irec)) { | |
cb5b3ef4 | 635 | |
2556c98b | 636 | cur_irec = irec; |
cb5b3ef4 | 637 | |
2556c98b BN |
638 | num_inos = XFS_INODES_PER_CHUNK; |
639 | while (num_inos < XFS_IALLOC_INODES(mp) && irec != NULL) { | |
640 | irec = next_ino_rec(irec); | |
641 | num_inos += XFS_INODES_PER_CHUNK; | |
642 | } | |
cb5b3ef4 | 643 | |
2556c98b BN |
644 | if (args->dirs_only && cur_irec->ino_isa_dir == 0) |
645 | continue; | |
646 | #ifdef XR_PF_TRACE | |
647 | sem_getvalue(&args->ra_count, &i); | |
648 | pftrace("queuing irec %p in AG %d, sem count = %d", | |
649 | irec, args->agno, i); | |
650 | #endif | |
651 | sem_wait(&args->ra_count); | |
652 | ||
653 | num_inos = 0; | |
654 | bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum); | |
655 | ||
656 | do { | |
657 | pf_queue_io(args, XFS_AGB_TO_FSB(mp, args->agno, bno), | |
69ec88b5 BN |
658 | blks_per_cluster, |
659 | (cur_irec->ino_isa_dir != 0) ? | |
660 | B_DIR_INODE : B_INODE); | |
2556c98b | 661 | bno += blks_per_cluster; |
edf3f9d0 | 662 | num_inos += inodes_per_cluster; |
2556c98b | 663 | } while (num_inos < XFS_IALLOC_INODES(mp)); |
cb5b3ef4 MV |
664 | } |
665 | ||
2556c98b | 666 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 667 | |
2556c98b BN |
668 | #ifdef XR_PF_TRACE |
669 | pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)", | |
670 | args->agno, args->inode_bufs_queued); | |
671 | #endif | |
672 | args->queuing_done = 1; | |
673 | pf_start_io_workers(args); | |
674 | pf_start_processing(args); | |
675 | pthread_mutex_unlock(&args->lock); | |
676 | ||
677 | /* now wait for the readers to finish */ | |
678 | for (i = 0; i < PF_THREAD_COUNT; i++) | |
679 | if (args->io_threads[i]) | |
680 | pthread_join(args->io_threads[i], NULL); | |
681 | ||
682 | #ifdef XR_PF_TRACE | |
683 | pftrace("prefetch for AG %d finished", args->agno); | |
684 | #endif | |
685 | pthread_mutex_lock(&args->lock); | |
686 | ||
687 | ASSERT(args->primary_io_queue.height == 0); | |
688 | ASSERT(args->secondary_io_queue.height == 0); | |
689 | ||
690 | args->prefetch_done = 1; | |
691 | if (args->next_args) | |
692 | pf_create_prefetch_thread(args->next_args); | |
693 | ||
694 | pthread_mutex_unlock(&args->lock); | |
695 | ||
696 | return NULL; | |
cb5b3ef4 MV |
697 | } |
698 | ||
2556c98b BN |
699 | static int |
700 | pf_create_prefetch_thread( | |
701 | prefetch_args_t *args) | |
702 | { | |
703 | int err; | |
704 | ||
705 | #ifdef XR_PF_TRACE | |
706 | pftrace("creating queue thread for AG %d", args->agno); | |
707 | #endif | |
708 | err = pthread_create(&args->queuing_thread, NULL, | |
709 | pf_queuing_worker, args); | |
710 | if (err != 0) { | |
711 | do_warn(_("failed to create prefetch thread: %s\n"), | |
712 | strerror(err)); | |
713 | cleanup_inode_prefetch(args); | |
714 | } | |
715 | ||
716 | return err == 0; | |
717 | } | |
cb5b3ef4 MV |
718 | |
719 | void | |
2556c98b BN |
720 | init_prefetch( |
721 | xfs_mount_t *pmp) | |
cb5b3ef4 | 722 | { |
2556c98b BN |
723 | mp = pmp; |
724 | mp_fd = libxfs_device_to_fd(mp->m_dev); | |
725 | pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7; | |
726 | pf_max_bbs = pf_max_bytes >> BBSHIFT; | |
727 | pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog; | |
728 | pf_batch_bytes = DEF_BATCH_BYTES; | |
729 | pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1); | |
730 | } | |
cb5b3ef4 | 731 | |
2556c98b BN |
732 | prefetch_args_t * |
733 | start_inode_prefetch( | |
734 | xfs_agnumber_t agno, | |
735 | int dirs_only, | |
736 | prefetch_args_t *prev_args) | |
737 | { | |
738 | prefetch_args_t *args; | |
edf3f9d0 | 739 | long max_queue; |
cb5b3ef4 | 740 | |
2556c98b BN |
741 | if (!do_prefetch || agno >= mp->m_sb.sb_agcount) |
742 | return NULL; | |
cb5b3ef4 | 743 | |
2556c98b BN |
744 | args = calloc(1, sizeof(prefetch_args_t)); |
745 | ||
746 | INIT_RADIX_TREE(&args->primary_io_queue, 0); | |
747 | INIT_RADIX_TREE(&args->secondary_io_queue, 0); | |
748 | pthread_mutex_init(&args->lock, NULL); | |
749 | pthread_cond_init(&args->start_reading, NULL); | |
750 | pthread_cond_init(&args->start_processing, NULL); | |
751 | args->agno = agno; | |
752 | args->dirs_only = dirs_only; | |
753 | ||
754 | /* | |
755 | * use only 1/8 of the libxfs cache as we are only counting inodes | |
756 | * and not any other associated metadata like directories | |
757 | */ | |
758 | ||
edf3f9d0 BN |
759 | max_queue = libxfs_bcache->c_maxcount / thread_count / 8; |
760 | if (XFS_INODE_CLUSTER_SIZE(mp) > mp->m_sb.sb_blocksize) | |
761 | max_queue = max_queue * (XFS_INODE_CLUSTER_SIZE(mp) >> | |
762 | mp->m_sb.sb_blocklog) / XFS_IALLOC_BLOCKS(mp); | |
763 | ||
764 | sem_init(&args->ra_count, 0, max_queue); | |
2556c98b BN |
765 | |
766 | if (!prev_args) { | |
767 | if (!pf_create_prefetch_thread(args)) | |
768 | return NULL; | |
769 | } else { | |
770 | pthread_mutex_lock(&prev_args->lock); | |
771 | if (prev_args->prefetch_done) { | |
772 | if (!pf_create_prefetch_thread(args)) | |
773 | args = NULL; | |
774 | } else | |
775 | prev_args->next_args = args; | |
776 | pthread_mutex_unlock(&prev_args->lock); | |
cb5b3ef4 | 777 | } |
2556c98b BN |
778 | |
779 | return args; | |
cb5b3ef4 MV |
780 | } |
781 | ||
782 | void | |
2556c98b BN |
783 | wait_for_inode_prefetch( |
784 | prefetch_args_t *args) | |
cb5b3ef4 | 785 | { |
2556c98b | 786 | if (args == NULL) |
cb5b3ef4 | 787 | return; |
2556c98b BN |
788 | |
789 | pthread_mutex_lock(&args->lock); | |
790 | ||
791 | while (!args->can_start_processing) { | |
792 | #ifdef XR_PF_TRACE | |
793 | pftrace("waiting to start processing AG %d", args->agno); | |
794 | #endif | |
795 | pthread_cond_wait(&args->start_processing, &args->lock); | |
cb5b3ef4 | 796 | } |
2556c98b BN |
797 | #ifdef XR_PF_TRACE |
798 | pftrace("can start processing AG %d", args->agno); | |
799 | #endif | |
800 | pthread_mutex_unlock(&args->lock); | |
801 | } | |
cb5b3ef4 | 802 | |
2556c98b BN |
803 | void |
804 | cleanup_inode_prefetch( | |
805 | prefetch_args_t *args) | |
806 | { | |
807 | if (args == NULL) | |
808 | return; | |
cb5b3ef4 | 809 | |
2556c98b BN |
810 | #ifdef XR_PF_TRACE |
811 | pftrace("waiting AG %d prefetch to finish", args->agno); | |
812 | #endif | |
813 | if (args->queuing_thread) | |
814 | pthread_join(args->queuing_thread, NULL); | |
815 | ||
816 | #ifdef XR_PF_TRACE | |
817 | pftrace("AG %d prefetch done", args->agno); | |
818 | #endif | |
819 | pthread_mutex_destroy(&args->lock); | |
820 | pthread_cond_destroy(&args->start_reading); | |
821 | pthread_cond_destroy(&args->start_processing); | |
822 | sem_destroy(&args->ra_count); | |
823 | ||
824 | free(args); | |
cb5b3ef4 MV |
825 | } |
826 | ||
2556c98b BN |
827 | #ifdef XR_PF_TRACE |
828 | ||
cb5b3ef4 | 829 | void |
2556c98b | 830 | _pftrace(const char *func, const char *msg, ...) |
cb5b3ef4 | 831 | { |
2556c98b BN |
832 | char buf[200]; |
833 | struct timeval tv; | |
834 | va_list args; | |
cb5b3ef4 | 835 | |
2556c98b | 836 | gettimeofday(&tv, NULL); |
cb5b3ef4 | 837 | |
2556c98b BN |
838 | va_start(args, msg); |
839 | vsnprintf(buf, sizeof(buf), msg, args); | |
840 | buf[sizeof(buf)-1] = '\0'; | |
841 | va_end(args); | |
cb5b3ef4 | 842 | |
2556c98b | 843 | fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, func, buf); |
cb5b3ef4 | 844 | } |
2556c98b BN |
845 | |
846 | #endif |