]>
Commit | Line | Data |
---|---|---|
6b803e5a | 1 | #include "libxfs.h" |
2556c98b | 2 | #include <pthread.h> |
cb5b3ef4 | 3 | #include "avl.h" |
379397bf | 4 | #include "btree.h" |
cb5b3ef4 MV |
5 | #include "globals.h" |
6 | #include "agheader.h" | |
7 | #include "incore.h" | |
cb5b3ef4 | 8 | #include "dir2.h" |
cb5b3ef4 MV |
9 | #include "protos.h" |
10 | #include "err_protos.h" | |
11 | #include "dinode.h" | |
12 | #include "bmap.h" | |
13 | #include "versions.h" | |
2556c98b BN |
14 | #include "threads.h" |
15 | #include "prefetch.h" | |
16 | #include "progress.h" | |
cb5b3ef4 MV |
17 | |
int do_prefetch = 1;	/* nonzero => prefetching enabled (on by default) */

/*
 * Performs prefetching by priming the libxfs cache by using a dedicated
 * thread scanning inodes and reading blocks in ahead of time they are
 * required.
 *
 * Any I/O errors can be safely ignored.
 */

static xfs_mount_t	*mp;		/* filesystem being prefetched */
static int		mp_fd;		/* raw fd of the data device */
static int		pf_max_bytes;	/* upper bound on one prefetch read */
static int		pf_max_bbs;	/* same bound, in basic blocks */
static int		pf_max_fsbs;	/* same bound, in fs blocks */
static int		pf_batch_bytes;	/* target size of a batched read */
static int		pf_batch_fsbs;	/* batch threshold, in fs blocks */

static void		pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);

/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 */

/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	CACHE_PREFETCH_PRIORITY + 7
/* directory metadata in secondary queue */
#define B_DIR_META_2	CACHE_PREFETCH_PRIORITY + 6
/* dir metadata that had to fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	CACHE_PREFETCH_PRIORITY + 5
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	CACHE_PREFETCH_PRIORITY + 4
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	CACHE_PREFETCH_PRIORITY + 3
/* inode clusters with directory inodes */
#define B_DIR_INODE	CACHE_PREFETCH_PRIORITY + 2
/* intermediate extent btree nodes */
#define B_BMAP		CACHE_PREFETCH_PRIORITY + 1
/* inode clusters without any directory entries */
#define B_INODE		CACHE_PREFETCH_PRIORITY

/*
 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
 * the buffer is for an inode or other metadata.
 */
#define B_IS_INODE(f)	(((f) & 5) == 0)

#define DEF_BATCH_BYTES	0x10000

#define MAX_BUFS	128

/* queued-inode-buffer level at which the I/O workers get kicked */
#define IO_THRESHOLD	(MAX_BUFS * 2)

/*
 * Which queue a batched read drains: primary covers blocks at/above
 * last_bno_read, secondary covers blocks behind it, and META_ONLY walks
 * the primary queue but skips inode cluster buffers.
 */
typedef enum pf_which {
	PF_PRIMARY,
	PF_SECONDARY,
	PF_META_ONLY
} pf_which_t;
78 | ||
79 | ||
80 | static inline void | |
81 | pf_start_processing( | |
82 | prefetch_args_t *args) | |
83 | { | |
84 | if (!args->can_start_processing) { | |
2556c98b | 85 | pftrace("signalling processing for AG %d", args->agno); |
4c0a98ae | 86 | |
2556c98b BN |
87 | args->can_start_processing = 1; |
88 | pthread_cond_signal(&args->start_processing); | |
cb5b3ef4 | 89 | } |
2556c98b BN |
90 | } |
91 | ||
92 | static inline void | |
93 | pf_start_io_workers( | |
94 | prefetch_args_t *args) | |
95 | { | |
96 | if (!args->can_start_reading) { | |
2556c98b | 97 | pftrace("signalling reading for AG %d", args->agno); |
4c0a98ae | 98 | |
2556c98b BN |
99 | args->can_start_reading = 1; |
100 | pthread_cond_broadcast(&args->start_reading); | |
cb5b3ef4 | 101 | } |
cb5b3ef4 MV |
102 | } |
103 | ||
2556c98b | 104 | |
cb5b3ef4 | 105 | static void |
2556c98b BN |
106 | pf_queue_io( |
107 | prefetch_args_t *args, | |
dd9093de DC |
108 | struct xfs_buf_map *map, |
109 | int nmaps, | |
2556c98b | 110 | int flag) |
cb5b3ef4 | 111 | { |
dd9093de DC |
112 | struct xfs_buf *bp; |
113 | xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn); | |
cb5b3ef4 | 114 | |
2ae22647 CH |
115 | /* |
116 | * Never block on a buffer lock here, given that the actual repair | |
117 | * code might lock buffers in a different order from us. Given that | |
118 | * the lock holder is either reading it from disk himself or | |
119 | * completely overwriting it this behaviour is perfectly fine. | |
120 | */ | |
dd9093de | 121 | bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK); |
2ae22647 CH |
122 | if (!bp) |
123 | return; | |
124 | ||
2556c98b | 125 | if (bp->b_flags & LIBXFS_B_UPTODATE) { |
69ec88b5 BN |
126 | if (B_IS_INODE(flag)) |
127 | pf_read_inode_dirs(args, bp); | |
b3563c19 BN |
128 | XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) + |
129 | CACHE_PREFETCH_PRIORITY); | |
2556c98b | 130 | libxfs_putbuf(bp); |
cb5b3ef4 MV |
131 | return; |
132 | } | |
69ec88b5 | 133 | XFS_BUF_SET_PRIORITY(bp, flag); |
cb5b3ef4 | 134 | |
2556c98b | 135 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 136 | |
bb34c934 BN |
137 | btree_insert(args->io_queue, fsbno, bp); |
138 | ||
2556c98b | 139 | if (fsbno > args->last_bno_read) { |
379397bf | 140 | if (B_IS_INODE(flag)) { |
2556c98b BN |
141 | args->inode_bufs_queued++; |
142 | if (args->inode_bufs_queued == IO_THRESHOLD) | |
143 | pf_start_io_workers(args); | |
cb5b3ef4 | 144 | } |
2556c98b | 145 | } else { |
08cee623 | 146 | ASSERT(!B_IS_INODE(flag)); |
69ec88b5 | 147 | XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2); |
cb5b3ef4 MV |
148 | } |
149 | ||
4c0a98ae BN |
150 | pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue" |
151 | "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ? | |
152 | 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, | |
153 | args->inode_bufs_queued, args->last_bno_read); | |
154 | ||
2556c98b | 155 | pf_start_processing(args); |
cb5b3ef4 | 156 | |
2556c98b | 157 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 MV |
158 | } |
159 | ||
2556c98b BN |
/*
 * Walk @numrecs on-disk bmap records, batching contiguous directory blocks
 * into buffer maps of one dir-fs-block each and queuing them for prefetch.
 * Returns 1 if the record list looked sane, 0 if validation failed.
 */
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_filblks_t		cp = 0;	/* prev count */
	xfs_fileoff_t		op = 0;	/* prev offset */
#define MAP_ARRAY_SZ 4
	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
	struct xfs_buf_map	*map = map_array;
	int			max_extents = MAP_ARRAY_SZ;
	int			nmaps = 0;
	unsigned int		len = 0;
	int			ret = 0;


	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		/* records must be ascending, non-empty and in file range */
		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			goto out_free;

		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			goto out_free;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dir_geo->freeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		/* split the extent into dir-block sized queued I/Os */
		while (irec.br_blockcount) {
			unsigned int	bm_len;

			pftrace("queuing dir extent in AG %d", args->agno);

			if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
				bm_len = mp->m_dir_geo->fsbcount - len;
			else
				bm_len = irec.br_blockcount;
			len += bm_len;

			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
							irec.br_startblock);
			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
			nmaps++;

			/* a full directory block has been gathered: queue it */
			if (len == mp->m_dir_geo->fsbcount) {
				pf_queue_io(args, map, nmaps, B_DIR_META);
				len = 0;
				nmaps = 0;
			}

			irec.br_blockcount -= bm_len;
			irec.br_startblock += bm_len;

			/*
			 * Handle very fragmented dir2 blocks with dynamically
			 * allocated buffer maps.
			 */
			if (nmaps >= max_extents) {
				struct xfs_buf_map *old_map = NULL;

				/* first growth must not realloc the stack array */
				if (map == map_array) {
					old_map = map;
					map = NULL;
				}
				max_extents *= 2;
				map = realloc(map, max_extents * sizeof(*map));
				if (map == NULL) {
					do_error(
			_("couldn't malloc dir2 buffer list\n"));
					exit(1);
				}
				if (old_map)
					memcpy(map, old_map, sizeof(map_array));
			}

		}
	}
	ret = 1;
out_free:
	if (map != map_array)
		free(map);
	return ret;
}
cb5b3ef4 | 253 | |
2556c98b BN |
/*
 * simplified version of the main scan_lbtree. Returns 0 to stop.
 *
 * Reads one bmbt block at @dbno, tags it with a prefetch priority and
 * hands it to @func for validation/recursion.
 */
static int
pf_scan_lbtree(
	xfs_fsblock_t		dbno,
	int			level,
	int			isadir,
	prefetch_args_t		*args,
	int			(*func)(struct xfs_btree_block	*block,
					int			level,
					int			isadir,
					prefetch_args_t		*args))
{
	xfs_buf_t		*bp;
	int			rc;

	bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
			XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
	if (!bp)
		return 0;

	XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);

	/*
	 * If the verifier flagged a problem with the buffer, we can't trust
	 * its contents for the purposes of reading ahead. Stop prefetching
	 * the tree and mark the buffer unchecked so that the next read of the
	 * buffer will retain the error status and be acted upon appropriately.
	 */
	if (bp->b_error) {
		bp->b_flags |= LIBXFS_B_UNCHECKED;
		libxfs_putbuf(bp);
		return 0;
	}

	/* descend one level; callee validates the block contents */
	rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);

	libxfs_putbuf(bp);

	return rc;
}
297 | ||
/*
 * Scan callback for pf_scan_lbtree: validates a bmbt block, queues leaf
 * record lists (directories only) and recurses through interior nodes.
 * Returns 0 to stop the walk, 1 to continue.
 */
static int
pf_scanfunc_bmap(
	struct xfs_btree_block	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_ptr_t		*pp;
	int			numrecs;
	int			i;
	xfs_fsblock_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
	     block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		/* leaf: only directory extents are worth prefetching */
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;
		return pf_read_bmbt_reclist(args,
				XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
	}

	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

	for (i = 0; i < numrecs; i++) {
		/* on-disk ptrs may be unaligned within the block */
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}
341 | ||
2556c98b BN |
342 | |
/*
 * Prefetch the bmap btree rooted in @dino's data fork, after sanity
 * checking the root (level, record count, fork space).
 */
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_fsblock_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	/* a btree-format root must have records and at least one level */
	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}
382 | ||
383 | static void | |
384 | pf_read_exinode( | |
385 | prefetch_args_t *args, | |
386 | xfs_dinode_t *dino) | |
387 | { | |
388 | pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino), | |
56b2de80 | 389 | be32_to_cpu(dino->di_nextents)); |
2556c98b | 390 | } |
cb5b3ef4 | 391 | |
2556c98b BN |
/*
 * Scan every inode in cluster buffer @bp; for directory inodes that pass a
 * lightweight validity check, queue their extent/btree data for prefetch.
 * Raises the buffer's cache priority if it contained any directory inode.
 */
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;

	/* run the inode-buffer verifier; bail on a bad cluster */
	libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
	if (bp->b_error)
		return;

	for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
		dino = xfs_make_iptr(mp, bp, icnt);

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dino->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!libxfs_dinode_good_version(mp, dino->di_version))
			continue;

		if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dino->di_forkoff != 0) &&
		    (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
			continue;

		switch (dino->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	/* directory-bearing clusters stay cached longer */
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}
456 | ||
dd9093de DC |
/*
 * pf_batch_read must be called with the lock locked.
 *
 * Gathers up to MAX_BUFS queued buffers that lie close together on disk,
 * drops the lock, reads them with a single large pread() into @buf, copies
 * the data into the individual xfs_buf_t's and releases them, then retakes
 * the lock.  Loops until the selected queue is drained.
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno = 0;
	unsigned long		max_fsbno;
	char			*pbuf;

	for (;;) {
		num = 0;
		/* pick the queue window: behind or ahead of last_bno_read */
		if (which == PF_SECONDARY) {
			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
			max_fsbno = MIN(fsbno + pf_max_fsbs,
					args->last_bno_read);
		} else {
			bplist[0] = btree_find(args->io_queue,
					args->last_bno_read, &fsbno);
			max_fsbno = fsbno + pf_max_fsbs;
		}
		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
			/*
			 * Discontiguous buffers need special handling, so stop
			 * gathering new buffers and process the list and this
			 * discontigous buffer immediately. This avoids the
			 * complexity of keeping a separate discontigous buffer
			 * list and seeking back over ranges we've already done
			 * optimised reads for.
			 */
			if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
				num++;
				break;
			}

			/* META_ONLY passes skip inode cluster buffers */
			if (which != PF_META_ONLY ||
			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
				num++;
			if (num == MAX_BUFS)
				break;
			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
		}
		if (!num)
			return;

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks and
		 * read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		while (num > 1 && last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		/* remove the selected buffers from the queue before I/O */
		for (i = 0; i < num; i++) {
			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
		}
#ifdef XR_PF_TRACE
		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);
#endif
		/* drop the lock for the duration of the actual I/O */
		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_but_t's
		 */
		len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);

		/*
		 * Check the last buffer on the list to see if we need to
		 * process a discontiguous buffer. The gather above loop
		 * guarantees that only the last buffer in the list will be a
		 * discontiguous buffer.
		 */
		if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
			libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
			bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
			libxfs_putbuf(bplist[num - 1]);
			num--;
		}

		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			for (i = 0; i < num; i++) {

				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;
				memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
				bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
						       LIBXFS_B_UNCHECKED);
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_S);
			}
		}
		for (i = 0; i < num; i++) {
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);
			/*
			 * if primary inode queue running low, process metadata
			 * in boths queues to avoid I/O starvation as the
			 * processing thread would be waiting for a metadata
			 * buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_META_ONLY, buf);

				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}
640 | ||
/*
 * Prefetch I/O worker thread: waits for the queuing thread to signal
 * start_reading, drains both queues via pf_batch_read(), and exits once
 * queuing is done and the queue is empty.  @param is the AG's
 * prefetch_args_t.
 */
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	/* read buffer must be aligned for direct I/O on the device */
	void			*buf = memalign(libxfs_device_alignment(),
						pf_max_bytes);

	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);

		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);

		pftrace("starting prefetch I/O for AG %d", args->agno);

		/* pf_batch_read is entered and left with the lock held */
		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

		pftrace("ran out of bufs to prefetch for AG %d", args->agno);

		/* rearm the signal unless the queuing thread has finished */
		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

	pftrace("finished prefetch I/O for AG %d", args->agno);

	return NULL;
}
677 | ||
2556c98b BN |
static int
pf_create_prefetch_thread(
	prefetch_args_t		*args);

/*
 * Queuing thread for one AG: spawns the I/O workers, walks the AG's inode
 * records queuing each non-sparse inode cluster for prefetch (throttled by
 * the ra_count semaphore), then waits for the workers and chains to the
 * next AG's prefetch args if one is pending.
 */
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;
	uint64_t		sparse;

	blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			args->io_threads[i] = 0;
			if (i == 0) {
				/* no workers at all: unblock processing */
				pf_start_processing(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}
	pftrace("starting prefetch for AG %d", args->agno);

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		/* advance over the chunks making up one inode allocation */
		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < mp->m_ialloc_inos && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		err = sem_trywait(&args->ra_count);
		if (err < 0 && errno == EAGAIN) {
			/*
			 * Kick the queue once we have reached the limit;
			 * without this the threads processing the inodes
			 * might get stuck on a buffer that has been locked
			 * and added to the I/O queue but is waiting for
			 * the thread to be woken.
			 */
			pf_start_io_workers(args);
			sem_wait(&args->ra_count);
		}

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
		sparse = cur_irec->ir_sparse;

		do {
			struct xfs_buf_map	map;

			map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
			map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);

			/*
			 * Queue I/O for each non-sparse cluster. We can check
			 * sparse state in cluster sized chunks as cluster size
			 * is the min. granularity of sparse irec regions.
			 */
			if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
				pf_queue_io(args, &map, 1,
					    (cur_irec->ino_isa_dir != 0) ?
					     B_DIR_INODE : B_INODE);

			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
			sparse >>= inodes_per_cluster;
		} while (num_inos < mp->m_ialloc_inos);
	}

	pthread_mutex_lock(&args->lock);

	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);

	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

	pftrace("prefetch for AG %d finished", args->agno);

	pthread_mutex_lock(&args->lock);

	ASSERT(btree_is_empty(args->io_queue));

	/* chain the prefetch thread for the next AG, if any is waiting */
	args->prefetch_done = 1;
	if (args->next_args)
		pf_create_prefetch_thread(args->next_args);

	pthread_mutex_unlock(&args->lock);

	return NULL;
}
806 | ||
2556c98b BN |
807 | static int |
808 | pf_create_prefetch_thread( | |
809 | prefetch_args_t *args) | |
810 | { | |
811 | int err; | |
812 | ||
2556c98b | 813 | pftrace("creating queue thread for AG %d", args->agno); |
4c0a98ae | 814 | |
2556c98b BN |
815 | err = pthread_create(&args->queuing_thread, NULL, |
816 | pf_queuing_worker, args); | |
817 | if (err != 0) { | |
818 | do_warn(_("failed to create prefetch thread: %s\n"), | |
819 | strerror(err)); | |
53dc81db | 820 | args->queuing_thread = 0; |
2556c98b BN |
821 | cleanup_inode_prefetch(args); |
822 | } | |
823 | ||
824 | return err == 0; | |
825 | } | |
cb5b3ef4 MV |
826 | |
/*
 * One-time prefetch setup: record the mount and device fd, and size the
 * read limits from the page size and filesystem block size.
 */
void
init_prefetch(
	xfs_mount_t		*pmp)
{
	mp = pmp;
	mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
	/* largest single prefetch read: 128 pages */
	pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
	pf_max_bbs = pf_max_bytes >> BBSHIFT;
	pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
	pf_batch_bytes = DEF_BATCH_BYTES;
	/* note the extra +1 shift: threshold is half a batch of fs blocks */
	pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
}
cb5b3ef4 | 839 | |
2556c98b BN |
840 | prefetch_args_t * |
841 | start_inode_prefetch( | |
842 | xfs_agnumber_t agno, | |
843 | int dirs_only, | |
844 | prefetch_args_t *prev_args) | |
845 | { | |
846 | prefetch_args_t *args; | |
edf3f9d0 | 847 | long max_queue; |
cb5b3ef4 | 848 | |
2556c98b BN |
849 | if (!do_prefetch || agno >= mp->m_sb.sb_agcount) |
850 | return NULL; | |
cb5b3ef4 | 851 | |
2556c98b BN |
852 | args = calloc(1, sizeof(prefetch_args_t)); |
853 | ||
bb34c934 | 854 | btree_init(&args->io_queue); |
5e656dbb BN |
855 | if (pthread_mutex_init(&args->lock, NULL) != 0) |
856 | do_error(_("failed to initialize prefetch mutex\n")); | |
857 | if (pthread_cond_init(&args->start_reading, NULL) != 0) | |
858 | do_error(_("failed to initialize prefetch cond var\n")); | |
859 | if (pthread_cond_init(&args->start_processing, NULL) != 0) | |
860 | do_error(_("failed to initialize prefetch cond var\n")); | |
2556c98b BN |
861 | args->agno = agno; |
862 | args->dirs_only = dirs_only; | |
863 | ||
864 | /* | |
865 | * use only 1/8 of the libxfs cache as we are only counting inodes | |
866 | * and not any other associated metadata like directories | |
867 | */ | |
868 | ||
edf3f9d0 | 869 | max_queue = libxfs_bcache->c_maxcount / thread_count / 8; |
ff105f75 DC |
870 | if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize) |
871 | max_queue = max_queue * | |
872 | (mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) / | |
873 | mp->m_ialloc_blks; | |
edf3f9d0 BN |
874 | |
875 | sem_init(&args->ra_count, 0, max_queue); | |
2556c98b BN |
876 | |
877 | if (!prev_args) { | |
878 | if (!pf_create_prefetch_thread(args)) | |
879 | return NULL; | |
880 | } else { | |
881 | pthread_mutex_lock(&prev_args->lock); | |
882 | if (prev_args->prefetch_done) { | |
883 | if (!pf_create_prefetch_thread(args)) | |
884 | args = NULL; | |
885 | } else | |
886 | prev_args->next_args = args; | |
887 | pthread_mutex_unlock(&prev_args->lock); | |
cb5b3ef4 | 888 | } |
2556c98b BN |
889 | |
890 | return args; | |
cb5b3ef4 MV |
891 | } |
892 | ||
71014d19 DC |
893 | /* |
894 | * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It | |
895 | * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch | |
896 | * or process @end_ag). The function starts prefetch on the first AG, then loops | |
897 | * starting prefetch on the next AG and then blocks processing the current AG as | |
898 | * the prefetch queue brings inodes into the processing queue. | |
899 | * | |
900 | * There is only one prefetch taking place at a time, so the prefetch on the | |
901 | * next AG only starts once the current AG has been completely prefetched. Hence | |
902 | * the prefetch of the next AG will start some time before the processing of the | |
903 | * current AG finishes, ensuring that when we iterate an start processing the | |
904 | * next AG there is already a significant queue of inodes to process. | |
905 | * | |
906 | * Prefetch is done this way to prevent it from running too far ahead of the | |
907 | * processing. Allowing it to do so can cause cache thrashing, where new | |
908 | * prefetch causes previously prefetched buffers to be reclaimed before the | |
909 | * processing thread uses them. This results in reading all the inodes and | |
910 | * metadata twice per phase and it greatly slows down the processing. Hence we | |
911 | * have to carefully control how far ahead we prefetch... | |
912 | */ | |
913 | static void | |
914 | prefetch_ag_range( | |
915 | struct work_queue *work, | |
916 | xfs_agnumber_t start_ag, | |
917 | xfs_agnumber_t end_ag, | |
918 | bool dirs_only, | |
919 | void (*func)(struct work_queue *, | |
920 | xfs_agnumber_t, void *)) | |
921 | { | |
922 | int i; | |
923 | struct prefetch_args *pf_args[2]; | |
924 | ||
925 | pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL); | |
926 | for (i = start_ag; i < end_ag; i++) { | |
927 | /* Don't prefetch end_ag */ | |
928 | if (i + 1 < end_ag) | |
929 | pf_args[(~i) & 1] = start_inode_prefetch(i + 1, | |
930 | dirs_only, pf_args[i & 1]); | |
931 | func(work, i, pf_args[i & 1]); | |
932 | } | |
933 | } | |
934 | ||
935 | struct pf_work_args { | |
936 | xfs_agnumber_t start_ag; | |
937 | xfs_agnumber_t end_ag; | |
938 | bool dirs_only; | |
939 | void (*func)(struct work_queue *, xfs_agnumber_t, void *); | |
940 | }; | |
941 | ||
942 | static void | |
943 | prefetch_ag_range_work( | |
944 | struct work_queue *work, | |
945 | xfs_agnumber_t unused, | |
946 | void *args) | |
947 | { | |
948 | struct pf_work_args *wargs = args; | |
949 | ||
f8149110 | 950 | prefetch_ag_range(work, wargs->start_ag, wargs->end_ag, |
71014d19 DC |
951 | wargs->dirs_only, wargs->func); |
952 | free(args); | |
953 | } | |
954 | ||
1164bde5 DC |
955 | /* |
956 | * Do inode prefetch in the most optimal way for the context under which repair | |
957 | * has been run. | |
958 | */ | |
959 | void | |
960 | do_inode_prefetch( | |
961 | struct xfs_mount *mp, | |
962 | int stride, | |
963 | void (*func)(struct work_queue *, | |
964 | xfs_agnumber_t, void *), | |
965 | bool check_cache, | |
966 | bool dirs_only) | |
967 | { | |
71014d19 | 968 | int i; |
1164bde5 DC |
969 | struct work_queue queue; |
970 | struct work_queue *queues; | |
f994d14f | 971 | int queues_started = 0; |
1164bde5 DC |
972 | |
973 | /* | |
974 | * If the previous phases of repair have not overflowed the buffer | |
975 | * cache, then we don't need to re-read any of the metadata in the | |
976 | * filesystem - it's all in the cache. In that case, run a thread per | |
977 | * CPU to maximise parallelism of the queue to be processed. | |
978 | */ | |
979 | if (check_cache && !libxfs_bcache_overflowed()) { | |
980 | queue.mp = mp; | |
981 | create_work_queue(&queue, mp, libxfs_nproc()); | |
982 | for (i = 0; i < mp->m_sb.sb_agcount; i++) | |
983 | queue_work(&queue, func, i, NULL); | |
984 | destroy_work_queue(&queue); | |
985 | return; | |
986 | } | |
987 | ||
988 | /* | |
989 | * single threaded behaviour - single prefetch thread, processed | |
990 | * directly after each AG is queued. | |
991 | */ | |
992 | if (!stride) { | |
993 | queue.mp = mp; | |
71014d19 DC |
994 | prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount, |
995 | dirs_only, func); | |
1164bde5 DC |
996 | return; |
997 | } | |
998 | ||
999 | /* | |
1000 | * create one worker thread for each segment of the volume | |
1001 | */ | |
1002 | queues = malloc(thread_count * sizeof(work_queue_t)); | |
71014d19 DC |
1003 | for (i = 0; i < thread_count; i++) { |
1004 | struct pf_work_args *wargs; | |
1005 | ||
1006 | wargs = malloc(sizeof(struct pf_work_args)); | |
1007 | wargs->start_ag = i * stride; | |
1008 | wargs->end_ag = min((i + 1) * stride, | |
1009 | mp->m_sb.sb_agcount); | |
1010 | wargs->dirs_only = dirs_only; | |
1011 | wargs->func = func; | |
1012 | ||
1164bde5 | 1013 | create_work_queue(&queues[i], mp, 1); |
71014d19 | 1014 | queue_work(&queues[i], prefetch_ag_range_work, 0, wargs); |
f994d14f | 1015 | queues_started++; |
71014d19 DC |
1016 | |
1017 | if (wargs->end_ag >= mp->m_sb.sb_agcount) | |
1018 | break; | |
1164bde5 | 1019 | } |
71014d19 | 1020 | |
1164bde5 DC |
1021 | /* |
1022 | * wait for workers to complete | |
1023 | */ | |
f994d14f | 1024 | for (i = 0; i < queues_started; i++) |
1164bde5 DC |
1025 | destroy_work_queue(&queues[i]); |
1026 | free(queues); | |
1027 | } | |
1028 | ||
cb5b3ef4 | 1029 | void |
2556c98b BN |
1030 | wait_for_inode_prefetch( |
1031 | prefetch_args_t *args) | |
cb5b3ef4 | 1032 | { |
2556c98b | 1033 | if (args == NULL) |
cb5b3ef4 | 1034 | return; |
2556c98b BN |
1035 | |
1036 | pthread_mutex_lock(&args->lock); | |
1037 | ||
1038 | while (!args->can_start_processing) { | |
2556c98b | 1039 | pftrace("waiting to start processing AG %d", args->agno); |
4c0a98ae | 1040 | |
2556c98b | 1041 | pthread_cond_wait(&args->start_processing, &args->lock); |
cb5b3ef4 | 1042 | } |
2556c98b | 1043 | pftrace("can start processing AG %d", args->agno); |
4c0a98ae | 1044 | |
2556c98b BN |
1045 | pthread_mutex_unlock(&args->lock); |
1046 | } | |
cb5b3ef4 | 1047 | |
2556c98b BN |
1048 | void |
1049 | cleanup_inode_prefetch( | |
1050 | prefetch_args_t *args) | |
1051 | { | |
1052 | if (args == NULL) | |
1053 | return; | |
cb5b3ef4 | 1054 | |
2556c98b | 1055 | pftrace("waiting AG %d prefetch to finish", args->agno); |
4c0a98ae | 1056 | |
2556c98b BN |
1057 | if (args->queuing_thread) |
1058 | pthread_join(args->queuing_thread, NULL); | |
1059 | ||
2556c98b | 1060 | pftrace("AG %d prefetch done", args->agno); |
4c0a98ae | 1061 | |
2556c98b BN |
1062 | pthread_mutex_destroy(&args->lock); |
1063 | pthread_cond_destroy(&args->start_reading); | |
1064 | pthread_cond_destroy(&args->start_processing); | |
1065 | sem_destroy(&args->ra_count); | |
bb34c934 | 1066 | btree_destroy(args->io_queue); |
2556c98b BN |
1067 | |
1068 | free(args); | |
cb5b3ef4 MV |
1069 | } |
1070 | ||
2556c98b BN |
#ifdef XR_PF_TRACE

/* trace output stream; NULL if the trace file could not be opened */
static FILE *pf_trace_file;

/*
 * Open the prefetch trace file, line-buffered so entries appear promptly.
 * Fix: fopen was unchecked - setvbuf(NULL, ...) would crash if /tmp is
 * unwritable. Tracing is best-effort, so just run without it on failure.
 */
void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	if (!pf_trace_file)
		return;
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

/* Close the trace file if it was successfully opened. */
void
pftrace_done(void)
{
	if (pf_trace_file)
		fclose(pf_trace_file);
}

/*
 * Emit one timestamped trace line. @func is the calling function's name
 * (supplied by the pftrace() wrapper macro); @msg is a printf-style
 * format. Messages longer than the local buffer are truncated.
 */
void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	if (!pf_trace_file)
		return;

	gettimeofday(&tv, NULL);

	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	buf[sizeof(buf)-1] = '\0';
	va_end(args);

	fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec,
		func, buf);
}

#endif