]>
Commit | Line | Data |
---|---|---|
959ef981 DC |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
6b803e5a | 3 | #include "libxfs.h" |
2556c98b | 4 | #include <pthread.h> |
cb5b3ef4 | 5 | #include "avl.h" |
379397bf | 6 | #include "btree.h" |
cb5b3ef4 MV |
7 | #include "globals.h" |
8 | #include "agheader.h" | |
9 | #include "incore.h" | |
cb5b3ef4 | 10 | #include "dir2.h" |
cb5b3ef4 MV |
11 | #include "protos.h" |
12 | #include "err_protos.h" | |
13 | #include "dinode.h" | |
14 | #include "bmap.h" | |
15 | #include "versions.h" | |
2556c98b BN |
16 | #include "threads.h" |
17 | #include "prefetch.h" | |
18 | #include "progress.h" | |
cb5b3ef4 MV |
19 | |
20 | int do_prefetch = 1; | |
21 | ||
2556c98b BN |
22 | /* |
23 | * Performs prefetching by priming the libxfs cache by using a dedicate thread | |
24 | * scanning inodes and reading blocks in ahead of time they are required. | |
25 | * | |
26 | * Any I/O errors can be safely ignored. | |
27 | */ | |
cb5b3ef4 | 28 | |
2556c98b BN |
29 | static xfs_mount_t *mp; |
30 | static int mp_fd; | |
31 | static int pf_max_bytes; | |
32 | static int pf_max_bbs; | |
33 | static int pf_max_fsbs; | |
34 | static int pf_batch_bytes; | |
35 | static int pf_batch_fsbs; | |
cb5b3ef4 | 36 | |
69ec88b5 BN |
37 | static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *); |
38 | ||
a040d7c9 BN |
39 | /* |
40 | * Buffer priorities for the libxfs cache | |
41 | * | |
42 | * Directory metadata is ranked higher than other metadata as it's used | |
43 | * in phases 3, 4 and 6, while other metadata is only used in 3 & 4. | |
44 | */ | |
69ec88b5 | 45 | |
a040d7c9 BN |
46 | /* intermediate directory btree nodes - can't be queued */ |
47 | #define B_DIR_BMAP CACHE_PREFETCH_PRIORITY + 7 | |
48 | /* directory metadata in secondary queue */ | |
49 | #define B_DIR_META_2 CACHE_PREFETCH_PRIORITY + 6 | |
50 | /* dir metadata that had to fetched from the primary queue to avoid stalling */ | |
51 | #define B_DIR_META_H CACHE_PREFETCH_PRIORITY + 5 | |
52 | /* single block of directory metadata (can't batch read) */ | |
53 | #define B_DIR_META_S CACHE_PREFETCH_PRIORITY + 4 | |
54 | /* dir metadata with more than one block fetched in a single I/O */ | |
55 | #define B_DIR_META CACHE_PREFETCH_PRIORITY + 3 | |
56 | /* inode clusters with directory inodes */ | |
57 | #define B_DIR_INODE CACHE_PREFETCH_PRIORITY + 2 | |
58 | /* intermediate extent btree nodes */ | |
59 | #define B_BMAP CACHE_PREFETCH_PRIORITY + 1 | |
60 | /* inode clusters without any directory entries */ | |
61 | #define B_INODE CACHE_PREFETCH_PRIORITY | |
69ec88b5 | 62 | |
a040d7c9 BN |
63 | /* |
64 | * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if | |
65 | * the buffer is for an inode or other metadata. | |
66 | */ | |
67 | #define B_IS_INODE(f) (((f) & 5) == 0) | |
cb5b3ef4 | 68 | |
2556c98b BN |
69 | #define DEF_BATCH_BYTES 0x10000 |
70 | ||
71 | #define MAX_BUFS 128 | |
72 | ||
69ec88b5 | 73 | #define IO_THRESHOLD (MAX_BUFS * 2) |
2556c98b BN |
74 | |
75 | typedef enum pf_which { | |
76 | PF_PRIMARY, | |
77 | PF_SECONDARY, | |
78 | PF_META_ONLY | |
79 | } pf_which_t; | |
80 | ||
81 | ||
82 | static inline void | |
83 | pf_start_processing( | |
84 | prefetch_args_t *args) | |
85 | { | |
86 | if (!args->can_start_processing) { | |
2556c98b | 87 | pftrace("signalling processing for AG %d", args->agno); |
4c0a98ae | 88 | |
2556c98b BN |
89 | args->can_start_processing = 1; |
90 | pthread_cond_signal(&args->start_processing); | |
cb5b3ef4 | 91 | } |
2556c98b BN |
92 | } |
93 | ||
94 | static inline void | |
95 | pf_start_io_workers( | |
96 | prefetch_args_t *args) | |
97 | { | |
98 | if (!args->can_start_reading) { | |
2556c98b | 99 | pftrace("signalling reading for AG %d", args->agno); |
4c0a98ae | 100 | |
2556c98b BN |
101 | args->can_start_reading = 1; |
102 | pthread_cond_broadcast(&args->start_reading); | |
cb5b3ef4 | 103 | } |
cb5b3ef4 MV |
104 | } |
105 | ||
2556c98b | 106 | |
cb5b3ef4 | 107 | static void |
2556c98b BN |
108 | pf_queue_io( |
109 | prefetch_args_t *args, | |
dd9093de DC |
110 | struct xfs_buf_map *map, |
111 | int nmaps, | |
2556c98b | 112 | int flag) |
cb5b3ef4 | 113 | { |
dd9093de DC |
114 | struct xfs_buf *bp; |
115 | xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn); | |
583ca112 | 116 | int error; |
cb5b3ef4 | 117 | |
2ae22647 CH |
118 | /* |
119 | * Never block on a buffer lock here, given that the actual repair | |
120 | * code might lock buffers in a different order from us. Given that | |
121 | * the lock holder is either reading it from disk himself or | |
122 | * completely overwriting it this behaviour is perfectly fine. | |
123 | */ | |
583ca112 DW |
124 | error = -libxfs_buf_get_map(mp->m_dev, map, nmaps, |
125 | LIBXFS_GETBUF_TRYLOCK, &bp); | |
126 | if (error) | |
2ae22647 CH |
127 | return; |
128 | ||
2556c98b | 129 | if (bp->b_flags & LIBXFS_B_UPTODATE) { |
69ec88b5 BN |
130 | if (B_IS_INODE(flag)) |
131 | pf_read_inode_dirs(args, bp); | |
af60a998 | 132 | libxfs_buf_set_priority(bp, libxfs_buf_priority(bp) + |
b3563c19 | 133 | CACHE_PREFETCH_PRIORITY); |
e02ba985 | 134 | libxfs_buf_relse(bp); |
cb5b3ef4 MV |
135 | return; |
136 | } | |
af60a998 | 137 | libxfs_buf_set_priority(bp, flag); |
cb5b3ef4 | 138 | |
2556c98b | 139 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 140 | |
bb34c934 BN |
141 | btree_insert(args->io_queue, fsbno, bp); |
142 | ||
2556c98b | 143 | if (fsbno > args->last_bno_read) { |
379397bf | 144 | if (B_IS_INODE(flag)) { |
2556c98b BN |
145 | args->inode_bufs_queued++; |
146 | if (args->inode_bufs_queued == IO_THRESHOLD) | |
147 | pf_start_io_workers(args); | |
cb5b3ef4 | 148 | } |
2556c98b | 149 | } else { |
08cee623 | 150 | ASSERT(!B_IS_INODE(flag)); |
af60a998 | 151 | libxfs_buf_set_priority(bp, B_DIR_META_2); |
cb5b3ef4 MV |
152 | } |
153 | ||
4c0a98ae BN |
154 | pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue" |
155 | "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ? | |
156 | 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno, | |
157 | args->inode_bufs_queued, args->last_bno_read); | |
158 | ||
2556c98b | 159 | pf_start_processing(args); |
cb5b3ef4 | 160 | |
2556c98b | 161 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 MV |
162 | } |
163 | ||
2556c98b BN |
164 | static int |
165 | pf_read_bmbt_reclist( | |
166 | prefetch_args_t *args, | |
167 | xfs_bmbt_rec_t *rp, | |
168 | int numrecs) | |
cb5b3ef4 | 169 | { |
cb5b3ef4 | 170 | int i; |
e0a12bda | 171 | xfs_bmbt_irec_t irec; |
5a35bf2c DC |
172 | xfs_filblks_t cp = 0; /* prev count */ |
173 | xfs_fileoff_t op = 0; /* prev offset */ | |
dd9093de DC |
174 | #define MAP_ARRAY_SZ 4 |
175 | struct xfs_buf_map map_array[MAP_ARRAY_SZ]; | |
176 | struct xfs_buf_map *map = map_array; | |
177 | int max_extents = MAP_ARRAY_SZ; | |
24e04791 | 178 | int nmaps = 0; |
dd9093de DC |
179 | unsigned int len = 0; |
180 | int ret = 0; | |
181 | ||
2556c98b | 182 | |
5e656dbb BN |
183 | for (i = 0; i < numrecs; i++) { |
184 | libxfs_bmbt_disk_get_all(rp + i, &irec); | |
2556c98b | 185 | |
e0a12bda BN |
186 | if (((i > 0) && (op + cp > irec.br_startoff)) || |
187 | (irec.br_blockcount == 0) || | |
188 | (irec.br_startoff >= fs_max_file_offset)) | |
dd9093de | 189 | goto out_free; |
2556c98b | 190 | |
a6bd55d3 DW |
191 | if (!libxfs_verify_fsbno(mp, irec.br_startblock) || |
192 | !libxfs_verify_fsbno(mp, irec.br_startblock + | |
193 | irec.br_blockcount - 1)) | |
dd9093de | 194 | goto out_free; |
2556c98b | 195 | |
e0a12bda | 196 | if (!args->dirs_only && ((irec.br_startoff + |
ff105f75 | 197 | irec.br_blockcount) >= mp->m_dir_geo->freeblk)) |
2556c98b BN |
198 | break; /* only Phase 6 reads the free blocks */ |
199 | ||
e0a12bda BN |
200 | op = irec.br_startoff; |
201 | cp = irec.br_blockcount; | |
2556c98b | 202 | |
e0a12bda | 203 | while (irec.br_blockcount) { |
dd9093de | 204 | unsigned int bm_len; |
4c0a98ae | 205 | |
2556c98b | 206 | pftrace("queuing dir extent in AG %d", args->agno); |
4c0a98ae | 207 | |
ff105f75 DC |
208 | if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount) |
209 | bm_len = mp->m_dir_geo->fsbcount - len; | |
dd9093de DC |
210 | else |
211 | bm_len = irec.br_blockcount; | |
212 | len += bm_len; | |
213 | ||
214 | map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp, | |
215 | irec.br_startblock); | |
216 | map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len); | |
217 | nmaps++; | |
218 | ||
ff105f75 | 219 | if (len == mp->m_dir_geo->fsbcount) { |
dd9093de DC |
220 | pf_queue_io(args, map, nmaps, B_DIR_META); |
221 | len = 0; | |
222 | nmaps = 0; | |
223 | } | |
224 | ||
225 | irec.br_blockcount -= bm_len; | |
226 | irec.br_startblock += bm_len; | |
227 | ||
228 | /* | |
229 | * Handle very fragmented dir2 blocks with dynamically | |
230 | * allocated buffer maps. | |
231 | */ | |
232 | if (nmaps >= max_extents) { | |
233 | struct xfs_buf_map *old_map = NULL; | |
234 | ||
235 | if (map == map_array) { | |
236 | old_map = map; | |
237 | map = NULL; | |
238 | } | |
239 | max_extents *= 2; | |
240 | map = realloc(map, max_extents * sizeof(*map)); | |
241 | if (map == NULL) { | |
242 | do_error( | |
243 | _("couldn't malloc dir2 buffer list\n")); | |
244 | exit(1); | |
245 | } | |
246 | if (old_map) | |
247 | memcpy(map, old_map, sizeof(map_array)); | |
248 | } | |
249 | ||
2556c98b BN |
250 | } |
251 | } | |
dd9093de DC |
252 | ret = 1; |
253 | out_free: | |
254 | if (map != map_array) | |
255 | free(map); | |
256 | return ret; | |
2556c98b | 257 | } |
cb5b3ef4 | 258 | |
2556c98b BN |
259 | /* |
260 | * simplified version of the main scan_lbtree. Returns 0 to stop. | |
261 | */ | |
262 | ||
263 | static int | |
264 | pf_scan_lbtree( | |
5a35bf2c | 265 | xfs_fsblock_t dbno, |
2556c98b BN |
266 | int level, |
267 | int isadir, | |
268 | prefetch_args_t *args, | |
b3563c19 | 269 | int (*func)(struct xfs_btree_block *block, |
2556c98b BN |
270 | int level, |
271 | int isadir, | |
272 | prefetch_args_t *args)) | |
273 | { | |
274 | xfs_buf_t *bp; | |
275 | int rc; | |
31079e67 | 276 | int error; |
cb5b3ef4 | 277 | |
31079e67 DW |
278 | error = -libxfs_buf_read(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno), |
279 | XFS_FSB_TO_BB(mp, 1), LIBXFS_READBUF_SALVAGE, &bp, | |
280 | &xfs_bmbt_buf_ops); | |
281 | if (error) | |
2556c98b | 282 | return 0; |
cb5b3ef4 | 283 | |
af60a998 | 284 | libxfs_buf_set_priority(bp, isadir ? B_DIR_BMAP : B_BMAP); |
69ec88b5 | 285 | |
43ba1861 DW |
286 | /* |
287 | * If the verifier flagged a problem with the buffer, we can't trust | |
288 | * its contents for the purposes of reading ahead. Stop prefetching | |
289 | * the tree and mark the buffer unchecked so that the next read of the | |
290 | * buffer will retain the error status and be acted upon appropriately. | |
291 | */ | |
292 | if (bp->b_error) { | |
293 | bp->b_flags |= LIBXFS_B_UNCHECKED; | |
e02ba985 | 294 | libxfs_buf_relse(bp); |
43ba1861 DW |
295 | return 0; |
296 | } | |
297 | ||
b3563c19 | 298 | rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args); |
cb5b3ef4 | 299 | |
e02ba985 | 300 | libxfs_buf_relse(bp); |
cb5b3ef4 | 301 | |
2556c98b BN |
302 | return rc; |
303 | } | |
304 | ||
305 | static int | |
306 | pf_scanfunc_bmap( | |
b3563c19 | 307 | struct xfs_btree_block *block, |
2556c98b BN |
308 | int level, |
309 | int isadir, | |
310 | prefetch_args_t *args) | |
311 | { | |
2556c98b BN |
312 | xfs_bmbt_ptr_t *pp; |
313 | int numrecs; | |
314 | int i; | |
5a35bf2c | 315 | xfs_fsblock_t dbno; |
2556c98b BN |
316 | |
317 | /* | |
318 | * do some validation on the block contents | |
319 | */ | |
1c88e98c DC |
320 | if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) && |
321 | block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) || | |
2556c98b BN |
322 | (be16_to_cpu(block->bb_level) != level)) |
323 | return 0; | |
324 | ||
325 | numrecs = be16_to_cpu(block->bb_numrecs); | |
326 | ||
327 | if (level == 0) { | |
328 | if (numrecs > mp->m_bmap_dmxr[0] || !isadir) | |
329 | return 0; | |
5e656dbb | 330 | return pf_read_bmbt_reclist(args, |
b3563c19 | 331 | XFS_BMBT_REC_ADDR(mp, block, 1), numrecs); |
cb5b3ef4 MV |
332 | } |
333 | ||
2556c98b BN |
334 | if (numrecs > mp->m_bmap_dmxr[1]) |
335 | return 0; | |
cb5b3ef4 | 336 | |
b3563c19 | 337 | pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); |
2556c98b BN |
338 | |
339 | for (i = 0; i < numrecs; i++) { | |
fb36a55d | 340 | dbno = get_unaligned_be64(&pp[i]); |
a6bd55d3 | 341 | if (!libxfs_verify_fsbno(mp, dbno)) |
2556c98b BN |
342 | return 0; |
343 | if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap)) | |
344 | return 0; | |
cb5b3ef4 | 345 | } |
2556c98b | 346 | return 1; |
cb5b3ef4 MV |
347 | } |
348 | ||
2556c98b BN |
349 | |
350 | static void | |
351 | pf_read_btinode( | |
352 | prefetch_args_t *args, | |
353 | xfs_dinode_t *dino, | |
354 | int isadir) | |
cb5b3ef4 | 355 | { |
2556c98b BN |
356 | xfs_bmdr_block_t *dib; |
357 | xfs_bmbt_ptr_t *pp; | |
358 | int i; | |
359 | int level; | |
360 | int numrecs; | |
361 | int dsize; | |
5a35bf2c | 362 | xfs_fsblock_t dbno; |
2556c98b BN |
363 | |
364 | dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino); | |
365 | ||
366 | level = be16_to_cpu(dib->bb_level); | |
367 | numrecs = be16_to_cpu(dib->bb_numrecs); | |
368 | ||
369 | if ((numrecs == 0) || (level == 0) || | |
370 | (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))) | |
cb5b3ef4 | 371 | return; |
2556c98b BN |
372 | /* |
373 | * use bmdr/dfork_dsize since the root block is in the data fork | |
374 | */ | |
375 | if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp)) | |
376 | return; | |
377 | ||
378 | dsize = XFS_DFORK_DSIZE(dino, mp); | |
e2f60652 | 379 | pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0)); |
cb5b3ef4 | 380 | |
2556c98b | 381 | for (i = 0; i < numrecs; i++) { |
fb36a55d | 382 | dbno = get_unaligned_be64(&pp[i]); |
a6bd55d3 | 383 | if (!libxfs_verify_fsbno(mp, dbno)) |
cb5b3ef4 | 384 | break; |
2556c98b | 385 | if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap)) |
cb5b3ef4 | 386 | break; |
2556c98b BN |
387 | } |
388 | } | |
389 | ||
390 | static void | |
391 | pf_read_exinode( | |
392 | prefetch_args_t *args, | |
393 | xfs_dinode_t *dino) | |
394 | { | |
395 | pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino), | |
56b2de80 | 396 | be32_to_cpu(dino->di_nextents)); |
2556c98b | 397 | } |
cb5b3ef4 | 398 | |
2556c98b BN |
399 | static void |
400 | pf_read_inode_dirs( | |
401 | prefetch_args_t *args, | |
402 | xfs_buf_t *bp) | |
403 | { | |
404 | xfs_dinode_t *dino; | |
405 | int icnt = 0; | |
69ec88b5 BN |
406 | int hasdir = 0; |
407 | int isadir; | |
456371d8 | 408 | int error; |
2556c98b | 409 | |
456371d8 DW |
410 | error = -libxfs_readbuf_verify(bp, &xfs_inode_buf_ops); |
411 | if (error) | |
e0607266 DC |
412 | return; |
413 | ||
135e4bfe | 414 | for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) { |
56b2de80 | 415 | dino = xfs_make_iptr(mp, bp, icnt); |
2556c98b BN |
416 | |
417 | /* | |
418 | * We are only prefetching directory contents in extents | |
419 | * and btree nodes for other inodes | |
420 | */ | |
56b2de80 | 421 | isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR; |
69ec88b5 BN |
422 | hasdir |= isadir; |
423 | ||
56b2de80 | 424 | if (dino->di_format <= XFS_DINODE_FMT_LOCAL) |
69ec88b5 BN |
425 | continue; |
426 | ||
56b2de80 | 427 | if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS || |
69ec88b5 | 428 | args->dirs_only)) |
2556c98b BN |
429 | continue; |
430 | ||
431 | /* | |
432 | * do some checks on the inode to see if we can prefetch | |
433 | * its directory data. It's a cut down version of | |
434 | * process_dinode_int() in dinode.c. | |
435 | */ | |
56b2de80 | 436 | if (dino->di_format > XFS_DINODE_FMT_BTREE) |
2556c98b BN |
437 | continue; |
438 | ||
56b2de80 | 439 | if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC) |
2556c98b BN |
440 | continue; |
441 | ||
db84c7e8 | 442 | if (!libxfs_dinode_good_version(&mp->m_sb, dino->di_version)) |
2556c98b BN |
443 | continue; |
444 | ||
56b2de80 | 445 | if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp)) |
2556c98b BN |
446 | continue; |
447 | ||
56b2de80 | 448 | if ((dino->di_forkoff != 0) && |
4de63245 | 449 | (dino->di_forkoff >= XFS_LITINO(mp) >> 3)) |
2556c98b BN |
450 | continue; |
451 | ||
56b2de80 | 452 | switch (dino->di_format) { |
2556c98b BN |
453 | case XFS_DINODE_FMT_EXTENTS: |
454 | pf_read_exinode(args, dino); | |
cb5b3ef4 | 455 | break; |
2556c98b | 456 | case XFS_DINODE_FMT_BTREE: |
69ec88b5 | 457 | pf_read_btinode(args, dino, isadir); |
cb5b3ef4 | 458 | break; |
cb5b3ef4 MV |
459 | } |
460 | } | |
69ec88b5 | 461 | if (hasdir) |
af60a998 | 462 | libxfs_buf_set_priority(bp, B_DIR_INODE); |
cb5b3ef4 MV |
463 | } |
464 | ||
dd9093de DC |
465 | /* |
466 | * pf_batch_read must be called with the lock locked. | |
467 | */ | |
cb5b3ef4 | 468 | static void |
2556c98b BN |
469 | pf_batch_read( |
470 | prefetch_args_t *args, | |
471 | pf_which_t which, | |
472 | void *buf) | |
cb5b3ef4 | 473 | { |
2556c98b BN |
474 | xfs_buf_t *bplist[MAX_BUFS]; |
475 | unsigned int num; | |
476 | off64_t first_off, last_off, next_off; | |
477 | int len, size; | |
cb5b3ef4 | 478 | int i; |
2556c98b | 479 | int inode_bufs; |
e33b06a3 | 480 | unsigned long fsbno = 0; |
379397bf | 481 | unsigned long max_fsbno; |
2556c98b BN |
482 | char *pbuf; |
483 | ||
bb34c934 | 484 | for (;;) { |
379397bf | 485 | num = 0; |
bb34c934 BN |
486 | if (which == PF_SECONDARY) { |
487 | bplist[0] = btree_find(args->io_queue, 0, &fsbno); | |
68d16907 | 488 | max_fsbno = min(fsbno + pf_max_fsbs, |
bb34c934 BN |
489 | args->last_bno_read); |
490 | } else { | |
491 | bplist[0] = btree_find(args->io_queue, | |
492 | args->last_bno_read, &fsbno); | |
493 | max_fsbno = fsbno + pf_max_fsbs; | |
494 | } | |
379397bf | 495 | while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) { |
dd9093de | 496 | /* |
bbd32754 DC |
497 | * Discontiguous buffers need special handling, so stop |
498 | * gathering new buffers and process the list and this | |
499 | * discontigous buffer immediately. This avoids the | |
500 | * complexity of keeping a separate discontigous buffer | |
501 | * list and seeking back over ranges we've already done | |
502 | * optimised reads for. | |
dd9093de DC |
503 | */ |
504 | if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) { | |
bbd32754 DC |
505 | num++; |
506 | break; | |
507 | } | |
508 | ||
509 | if (which != PF_META_ONLY || | |
af60a998 | 510 | !B_IS_INODE(libxfs_buf_priority(bplist[num]))) |
379397bf | 511 | num++; |
e49f30a7 ES |
512 | if (num == MAX_BUFS) |
513 | break; | |
bb34c934 | 514 | bplist[num] = btree_lookup_next(args->io_queue, &fsbno); |
2556c98b | 515 | } |
379397bf BN |
516 | if (!num) |
517 | return; | |
cb5b3ef4 | 518 | |
2556c98b BN |
519 | /* |
520 | * do a big read if 25% of the potential buffer is useful, | |
521 | * otherwise, find as many close together blocks and | |
522 | * read them in one read | |
523 | */ | |
524 | first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0])); | |
525 | last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) + | |
526 | XFS_BUF_SIZE(bplist[num-1]); | |
2c350101 | 527 | while (num > 1 && last_off - first_off > pf_max_bytes) { |
2556c98b BN |
528 | num--; |
529 | last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) + | |
530 | XFS_BUF_SIZE(bplist[num-1]); | |
531 | } | |
532 | if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) { | |
533 | /* | |
534 | * not enough blocks for one big read, so determine | |
535 | * the number of blocks that are close enough. | |
536 | */ | |
537 | last_off = first_off + XFS_BUF_SIZE(bplist[0]); | |
538 | for (i = 1; i < num; i++) { | |
539 | next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) + | |
540 | XFS_BUF_SIZE(bplist[i]); | |
541 | if (next_off - last_off > pf_batch_bytes) | |
542 | break; | |
543 | last_off = next_off; | |
544 | } | |
545 | num = i; | |
546 | } | |
cb5b3ef4 | 547 | |
2556c98b | 548 | for (i = 0; i < num; i++) { |
bb34c934 | 549 | if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp, |
2556c98b BN |
550 | XFS_BUF_ADDR(bplist[i]))) == NULL) |
551 | do_error(_("prefetch corruption\n")); | |
cb5b3ef4 MV |
552 | } |
553 | ||
2556c98b BN |
554 | if (which == PF_PRIMARY) { |
555 | for (inode_bufs = 0, i = 0; i < num; i++) { | |
af60a998 | 556 | if (B_IS_INODE(libxfs_buf_priority(bplist[i]))) |
2556c98b BN |
557 | inode_bufs++; |
558 | } | |
559 | args->inode_bufs_queued -= inode_bufs; | |
560 | if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) > | |
561 | pf_batch_fsbs) | |
562 | args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog); | |
563 | } | |
564 | #ifdef XR_PF_TRACE | |
565 | pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)", | |
566 | (long long)XFS_BUF_ADDR(bplist[0]), | |
567 | (long long)XFS_BUF_ADDR(bplist[num-1]), num, | |
568 | (which != PF_SECONDARY) ? "pri" : "sec", args->agno, | |
569 | args->last_bno_read, args->inode_bufs_queued); | |
570 | #endif | |
571 | pthread_mutex_unlock(&args->lock); | |
572 | ||
573 | /* | |
574 | * now read the data and put into the xfs_but_t's | |
575 | */ | |
2f9a125c | 576 | len = pread(mp_fd, buf, (int)(last_off - first_off), first_off); |
bbd32754 DC |
577 | |
578 | /* | |
579 | * Check the last buffer on the list to see if we need to | |
580 | * process a discontiguous buffer. The gather above loop | |
581 | * guarantees that only the last buffer in the list will be a | |
582 | * discontiguous buffer. | |
583 | */ | |
584 | if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) { | |
585 | libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0); | |
586 | bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED; | |
e02ba985 | 587 | libxfs_buf_relse(bplist[num - 1]); |
bbd32754 DC |
588 | num--; |
589 | } | |
590 | ||
2556c98b BN |
591 | if (len > 0) { |
592 | /* | |
593 | * go through the xfs_buf_t list copying from the | |
594 | * read buffer into the xfs_buf_t's and release them. | |
595 | */ | |
2556c98b BN |
596 | for (i = 0; i < num; i++) { |
597 | ||
598 | pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off); | |
599 | size = XFS_BUF_SIZE(bplist[i]); | |
600 | if (len < size) | |
601 | break; | |
04338619 | 602 | memcpy(bplist[i]->b_addr, pbuf, size); |
adbb3573 DC |
603 | bplist[i]->b_flags |= (LIBXFS_B_UPTODATE | |
604 | LIBXFS_B_UNCHECKED); | |
2556c98b | 605 | len -= size; |
af60a998 | 606 | if (B_IS_INODE(libxfs_buf_priority(bplist[i]))) |
2556c98b | 607 | pf_read_inode_dirs(args, bplist[i]); |
69ec88b5 | 608 | else if (which == PF_META_ONLY) |
af60a998 | 609 | libxfs_buf_set_priority(bplist[i], |
69ec88b5 BN |
610 | B_DIR_META_H); |
611 | else if (which == PF_PRIMARY && num == 1) | |
af60a998 | 612 | libxfs_buf_set_priority(bplist[i], |
69ec88b5 | 613 | B_DIR_META_S); |
2556c98b BN |
614 | } |
615 | } | |
616 | for (i = 0; i < num; i++) { | |
2556c98b | 617 | pftrace("putbuf %c %p (%llu) in AG %d", |
af60a998 DW |
618 | B_IS_INODE(libxfs_buf_priority(bplist[i])) ? |
619 | 'I' : 'M', | |
2556c98b BN |
620 | bplist[i], (long long)XFS_BUF_ADDR(bplist[i]), |
621 | args->agno); | |
e02ba985 | 622 | libxfs_buf_relse(bplist[i]); |
2556c98b BN |
623 | } |
624 | pthread_mutex_lock(&args->lock); | |
625 | if (which != PF_SECONDARY) { | |
2556c98b BN |
626 | pftrace("inode_bufs_queued for AG %d = %d", args->agno, |
627 | args->inode_bufs_queued); | |
2556c98b BN |
628 | /* |
629 | * if primary inode queue running low, process metadata | |
630 | * in boths queues to avoid I/O starvation as the | |
631 | * processing thread would be waiting for a metadata | |
632 | * buffer | |
633 | */ | |
634 | if (which == PF_PRIMARY && !args->queuing_done && | |
635 | args->inode_bufs_queued < IO_THRESHOLD) { | |
2556c98b BN |
636 | pftrace("reading metadata bufs from primary queue for AG %d", |
637 | args->agno); | |
4c0a98ae | 638 | |
2556c98b | 639 | pf_batch_read(args, PF_META_ONLY, buf); |
4c0a98ae | 640 | |
2556c98b BN |
641 | pftrace("reading bufs from secondary queue for AG %d", |
642 | args->agno); | |
4c0a98ae | 643 | |
2556c98b BN |
644 | pf_batch_read(args, PF_SECONDARY, buf); |
645 | } | |
cb5b3ef4 | 646 | } |
cb5b3ef4 | 647 | } |
2556c98b BN |
648 | } |
649 | ||
650 | static void * | |
651 | pf_io_worker( | |
652 | void *param) | |
653 | { | |
654 | prefetch_args_t *args = param; | |
655 | void *buf = memalign(libxfs_device_alignment(), | |
656 | pf_max_bytes); | |
657 | ||
658 | if (buf == NULL) | |
659 | return NULL; | |
cb5b3ef4 | 660 | |
2556c98b | 661 | pthread_mutex_lock(&args->lock); |
bb34c934 | 662 | while (!args->queuing_done || !btree_is_empty(args->io_queue)) { |
2556c98b | 663 | pftrace("waiting to start prefetch I/O for AG %d", args->agno); |
4c0a98ae | 664 | |
2556c98b BN |
665 | while (!args->can_start_reading && !args->queuing_done) |
666 | pthread_cond_wait(&args->start_reading, &args->lock); | |
4c0a98ae | 667 | |
2556c98b | 668 | pftrace("starting prefetch I/O for AG %d", args->agno); |
4c0a98ae | 669 | |
2556c98b BN |
670 | pf_batch_read(args, PF_PRIMARY, buf); |
671 | pf_batch_read(args, PF_SECONDARY, buf); | |
672 | ||
2556c98b | 673 | pftrace("ran out of bufs to prefetch for AG %d", args->agno); |
4c0a98ae | 674 | |
2556c98b BN |
675 | if (!args->queuing_done) |
676 | args->can_start_reading = 0; | |
cb5b3ef4 | 677 | } |
2556c98b | 678 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 | 679 | |
2556c98b BN |
680 | free(buf); |
681 | ||
2556c98b | 682 | pftrace("finished prefetch I/O for AG %d", args->agno); |
4c0a98ae | 683 | |
2556c98b | 684 | return NULL; |
cb5b3ef4 MV |
685 | } |
686 | ||
2556c98b BN |
687 | static int |
688 | pf_create_prefetch_thread( | |
689 | prefetch_args_t *args); | |
690 | ||
b97ad969 JM |
691 | /* |
692 | * If we fail to create the queuing thread or can't create even one | |
693 | * prefetch thread, we need to let processing continue without it. | |
694 | */ | |
695 | static void | |
696 | pf_skip_prefetch_thread(prefetch_args_t *args) | |
697 | { | |
698 | prefetch_args_t *next; | |
699 | ||
700 | pthread_mutex_lock(&args->lock); | |
701 | args->prefetch_done = 1; | |
702 | pf_start_processing(args); | |
703 | next = args->next_args; | |
704 | args->next_args = NULL; | |
705 | pthread_mutex_unlock(&args->lock); | |
706 | ||
707 | if (next) | |
708 | pf_create_prefetch_thread(next); | |
709 | } | |
710 | ||
2556c98b BN |
711 | static void * |
712 | pf_queuing_worker( | |
713 | void *param) | |
cb5b3ef4 | 714 | { |
2556c98b | 715 | prefetch_args_t *args = param; |
b97ad969 | 716 | prefetch_args_t *next_args; |
2556c98b BN |
717 | int num_inos; |
718 | ino_tree_node_t *irec; | |
719 | ino_tree_node_t *cur_irec; | |
2556c98b | 720 | xfs_agblock_t bno; |
cb5b3ef4 | 721 | int i; |
2556c98b | 722 | int err; |
870b18fd | 723 | uint64_t sparse; |
e7fd2b6f | 724 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
41baceb7 | 725 | unsigned long long cluster_mask; |
2556c98b | 726 | |
41baceb7 | 727 | cluster_mask = (1ULL << igeo->inodes_per_cluster) - 1; |
2556c98b BN |
728 | |
729 | for (i = 0; i < PF_THREAD_COUNT; i++) { | |
730 | err = pthread_create(&args->io_threads[i], NULL, | |
731 | pf_io_worker, args); | |
732 | if (err != 0) { | |
733 | do_warn(_("failed to create prefetch thread: %s\n"), | |
734 | strerror(err)); | |
e8ff6275 JM |
735 | pftrace("failed to create prefetch thread for AG %d: %s", |
736 | args->agno, strerror(err)); | |
53dc81db | 737 | args->io_threads[i] = 0; |
2556c98b | 738 | if (i == 0) { |
b97ad969 | 739 | pf_skip_prefetch_thread(args); |
2556c98b BN |
740 | return NULL; |
741 | } | |
742 | /* | |
743 | * since we have at least one I/O thread, use them for | |
744 | * prefetch | |
745 | */ | |
746 | break; | |
747 | } | |
cb5b3ef4 | 748 | } |
2556c98b | 749 | pftrace("starting prefetch for AG %d", args->agno); |
cb5b3ef4 | 750 | |
2556c98b BN |
751 | for (irec = findfirst_inode_rec(args->agno); irec != NULL; |
752 | irec = next_ino_rec(irec)) { | |
cb5b3ef4 | 753 | |
2556c98b | 754 | cur_irec = irec; |
cb5b3ef4 | 755 | |
2556c98b | 756 | num_inos = XFS_INODES_PER_CHUNK; |
e7fd2b6f | 757 | while (num_inos < igeo->ialloc_inos && irec != NULL) { |
2556c98b BN |
758 | irec = next_ino_rec(irec); |
759 | num_inos += XFS_INODES_PER_CHUNK; | |
760 | } | |
cb5b3ef4 | 761 | |
2556c98b BN |
762 | if (args->dirs_only && cur_irec->ino_isa_dir == 0) |
763 | continue; | |
764 | #ifdef XR_PF_TRACE | |
765 | sem_getvalue(&args->ra_count, &i); | |
766 | pftrace("queuing irec %p in AG %d, sem count = %d", | |
767 | irec, args->agno, i); | |
768 | #endif | |
3724f674 | 769 | err = sem_trywait(&args->ra_count); |
004e18d4 | 770 | if (err < 0 && errno == EAGAIN) { |
3724f674 CH |
771 | /* |
772 | * Kick the queue once we have reached the limit; | |
773 | * without this the threads processing the inodes | |
774 | * might get stuck on a buffer that has been locked | |
775 | * and added to the I/O queue but is waiting for | |
776 | * the thread to be woken. | |
7cf2aa1a ES |
777 | * Start processing as well, in case everything so |
778 | * far was already prefetched and the queue is empty. | |
3724f674 | 779 | */ |
7cf2aa1a | 780 | |
3724f674 | 781 | pf_start_io_workers(args); |
7cf2aa1a | 782 | pf_start_processing(args); |
3724f674 CH |
783 | sem_wait(&args->ra_count); |
784 | } | |
2556c98b BN |
785 | |
786 | num_inos = 0; | |
787 | bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum); | |
870b18fd | 788 | sparse = cur_irec->ir_sparse; |
2556c98b BN |
789 | |
790 | do { | |
dd9093de DC |
791 | struct xfs_buf_map map; |
792 | ||
793 | map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno); | |
41baceb7 DW |
794 | map.bm_len = XFS_FSB_TO_BB(mp, |
795 | igeo->blocks_per_cluster); | |
870b18fd BF |
796 | |
797 | /* | |
798 | * Queue I/O for each non-sparse cluster. We can check | |
799 | * sparse state in cluster sized chunks as cluster size | |
800 | * is the min. granularity of sparse irec regions. | |
801 | */ | |
41baceb7 | 802 | if ((sparse & cluster_mask) == 0) |
870b18fd BF |
803 | pf_queue_io(args, &map, 1, |
804 | (cur_irec->ino_isa_dir != 0) ? | |
805 | B_DIR_INODE : B_INODE); | |
806 | ||
41baceb7 DW |
807 | bno += igeo->blocks_per_cluster; |
808 | num_inos += igeo->inodes_per_cluster; | |
809 | sparse >>= igeo->inodes_per_cluster; | |
e7fd2b6f | 810 | } while (num_inos < igeo->ialloc_inos); |
cb5b3ef4 MV |
811 | } |
812 | ||
2556c98b | 813 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 814 | |
2556c98b BN |
815 | pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)", |
816 | args->agno, args->inode_bufs_queued); | |
4c0a98ae | 817 | |
2556c98b BN |
818 | args->queuing_done = 1; |
819 | pf_start_io_workers(args); | |
820 | pf_start_processing(args); | |
821 | pthread_mutex_unlock(&args->lock); | |
822 | ||
823 | /* now wait for the readers to finish */ | |
824 | for (i = 0; i < PF_THREAD_COUNT; i++) | |
825 | if (args->io_threads[i]) | |
826 | pthread_join(args->io_threads[i], NULL); | |
827 | ||
2556c98b | 828 | pftrace("prefetch for AG %d finished", args->agno); |
4c0a98ae | 829 | |
2556c98b BN |
830 | pthread_mutex_lock(&args->lock); |
831 | ||
bb34c934 | 832 | ASSERT(btree_is_empty(args->io_queue)); |
2556c98b BN |
833 | |
834 | args->prefetch_done = 1; | |
b97ad969 JM |
835 | next_args = args->next_args; |
836 | args->next_args = NULL; | |
2556c98b BN |
837 | pthread_mutex_unlock(&args->lock); |
838 | ||
b97ad969 JM |
839 | if (next_args) |
840 | pf_create_prefetch_thread(next_args); | |
841 | ||
2556c98b | 842 | return NULL; |
cb5b3ef4 MV |
843 | } |
844 | ||
2556c98b BN |
845 | static int |
846 | pf_create_prefetch_thread( | |
847 | prefetch_args_t *args) | |
848 | { | |
849 | int err; | |
850 | ||
2556c98b | 851 | pftrace("creating queue thread for AG %d", args->agno); |
4c0a98ae | 852 | |
2556c98b BN |
853 | err = pthread_create(&args->queuing_thread, NULL, |
854 | pf_queuing_worker, args); | |
855 | if (err != 0) { | |
856 | do_warn(_("failed to create prefetch thread: %s\n"), | |
857 | strerror(err)); | |
e8ff6275 JM |
858 | pftrace("failed to create prefetch thread for AG %d: %s", |
859 | args->agno, strerror(err)); | |
53dc81db | 860 | args->queuing_thread = 0; |
b97ad969 | 861 | pf_skip_prefetch_thread(args); |
2556c98b BN |
862 | } |
863 | ||
864 | return err == 0; | |
865 | } | |
cb5b3ef4 MV |
866 | |
867 | void | |
2556c98b BN |
868 | init_prefetch( |
869 | xfs_mount_t *pmp) | |
cb5b3ef4 | 870 | { |
2556c98b | 871 | mp = pmp; |
ab434d12 | 872 | mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->bt_bdev); |
2556c98b BN |
873 | pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7; |
874 | pf_max_bbs = pf_max_bytes >> BBSHIFT; | |
875 | pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog; | |
876 | pf_batch_bytes = DEF_BATCH_BYTES; | |
877 | pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1); | |
878 | } | |
cb5b3ef4 | 879 | |
2556c98b BN |
880 | prefetch_args_t * |
881 | start_inode_prefetch( | |
882 | xfs_agnumber_t agno, | |
883 | int dirs_only, | |
884 | prefetch_args_t *prev_args) | |
885 | { | |
886 | prefetch_args_t *args; | |
edf3f9d0 | 887 | long max_queue; |
e7fd2b6f | 888 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
cb5b3ef4 | 889 | |
2556c98b BN |
890 | if (!do_prefetch || agno >= mp->m_sb.sb_agcount) |
891 | return NULL; | |
cb5b3ef4 | 892 | |
2556c98b BN |
893 | args = calloc(1, sizeof(prefetch_args_t)); |
894 | ||
bb34c934 | 895 | btree_init(&args->io_queue); |
5e656dbb BN |
896 | if (pthread_mutex_init(&args->lock, NULL) != 0) |
897 | do_error(_("failed to initialize prefetch mutex\n")); | |
898 | if (pthread_cond_init(&args->start_reading, NULL) != 0) | |
899 | do_error(_("failed to initialize prefetch cond var\n")); | |
900 | if (pthread_cond_init(&args->start_processing, NULL) != 0) | |
901 | do_error(_("failed to initialize prefetch cond var\n")); | |
2556c98b BN |
902 | args->agno = agno; |
903 | args->dirs_only = dirs_only; | |
904 | ||
905 | /* | |
906 | * use only 1/8 of the libxfs cache as we are only counting inodes | |
907 | * and not any other associated metadata like directories | |
908 | */ | |
909 | ||
edf3f9d0 | 910 | max_queue = libxfs_bcache->c_maxcount / thread_count / 8; |
e7fd2b6f | 911 | if (igeo->inode_cluster_size > mp->m_sb.sb_blocksize) |
41baceb7 DW |
912 | max_queue = max_queue * igeo->blocks_per_cluster / |
913 | igeo->ialloc_blks; | |
edf3f9d0 BN |
914 | |
915 | sem_init(&args->ra_count, 0, max_queue); | |
2556c98b BN |
916 | |
917 | if (!prev_args) { | |
918 | if (!pf_create_prefetch_thread(args)) | |
919 | return NULL; | |
920 | } else { | |
921 | pthread_mutex_lock(&prev_args->lock); | |
922 | if (prev_args->prefetch_done) { | |
b97ad969 | 923 | pthread_mutex_unlock(&prev_args->lock); |
2556c98b BN |
924 | if (!pf_create_prefetch_thread(args)) |
925 | args = NULL; | |
e8ff6275 | 926 | } else { |
2556c98b | 927 | prev_args->next_args = args; |
e8ff6275 JM |
928 | pftrace("queued AG %d after AG %d", |
929 | args->agno, prev_args->agno); | |
b97ad969 | 930 | pthread_mutex_unlock(&prev_args->lock); |
e8ff6275 | 931 | } |
cb5b3ef4 | 932 | } |
2556c98b BN |
933 | |
934 | return args; | |
cb5b3ef4 MV |
935 | } |
936 | ||
71014d19 DC |
937 | /* |
938 | * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It | |
939 | * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch | |
940 | * or process @end_ag). The function starts prefetch on the first AG, then loops | |
941 | * starting prefetch on the next AG and then blocks processing the current AG as | |
942 | * the prefetch queue brings inodes into the processing queue. | |
943 | * | |
944 | * There is only one prefetch taking place at a time, so the prefetch on the | |
945 | * next AG only starts once the current AG has been completely prefetched. Hence | |
946 | * the prefetch of the next AG will start some time before the processing of the | |
947 | * current AG finishes, ensuring that when we iterate an start processing the | |
948 | * next AG there is already a significant queue of inodes to process. | |
949 | * | |
950 | * Prefetch is done this way to prevent it from running too far ahead of the | |
951 | * processing. Allowing it to do so can cause cache thrashing, where new | |
952 | * prefetch causes previously prefetched buffers to be reclaimed before the | |
953 | * processing thread uses them. This results in reading all the inodes and | |
954 | * metadata twice per phase and it greatly slows down the processing. Hence we | |
955 | * have to carefully control how far ahead we prefetch... | |
956 | */ | |
957 | static void | |
958 | prefetch_ag_range( | |
62843f36 | 959 | struct workqueue *work, |
71014d19 DC |
960 | xfs_agnumber_t start_ag, |
961 | xfs_agnumber_t end_ag, | |
962 | bool dirs_only, | |
62843f36 | 963 | void (*func)(struct workqueue *, |
71014d19 DC |
964 | xfs_agnumber_t, void *)) |
965 | { | |
966 | int i; | |
967 | struct prefetch_args *pf_args[2]; | |
968 | ||
969 | pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL); | |
970 | for (i = start_ag; i < end_ag; i++) { | |
971 | /* Don't prefetch end_ag */ | |
972 | if (i + 1 < end_ag) | |
973 | pf_args[(~i) & 1] = start_inode_prefetch(i + 1, | |
974 | dirs_only, pf_args[i & 1]); | |
975 | func(work, i, pf_args[i & 1]); | |
976 | } | |
977 | } | |
978 | ||
/*
 * Argument bundle for prefetch_ag_range_work(); allocated by
 * do_inode_prefetch() and freed by the worker after use.
 */
struct pf_work_args {
	xfs_agnumber_t	start_ag;	/* first AG of this worker's range */
	xfs_agnumber_t	end_ag;		/* one past the last AG in the range */
	bool		dirs_only;	/* prefetch directory metadata only */
	/* per-AG processing callback invoked by prefetch_ag_range() */
	void		(*func)(struct workqueue *, xfs_agnumber_t, void *);
};
985 | ||
986 | static void | |
987 | prefetch_ag_range_work( | |
62843f36 | 988 | struct workqueue *work, |
71014d19 DC |
989 | xfs_agnumber_t unused, |
990 | void *args) | |
991 | { | |
992 | struct pf_work_args *wargs = args; | |
993 | ||
f8149110 | 994 | prefetch_ag_range(work, wargs->start_ag, wargs->end_ag, |
71014d19 DC |
995 | wargs->dirs_only, wargs->func); |
996 | free(args); | |
997 | } | |
998 | ||
1164bde5 DC |
999 | /* |
1000 | * Do inode prefetch in the most optimal way for the context under which repair | |
1001 | * has been run. | |
1002 | */ | |
1003 | void | |
1004 | do_inode_prefetch( | |
1005 | struct xfs_mount *mp, | |
1006 | int stride, | |
62843f36 | 1007 | void (*func)(struct workqueue *, |
1164bde5 DC |
1008 | xfs_agnumber_t, void *), |
1009 | bool check_cache, | |
1010 | bool dirs_only) | |
1011 | { | |
71014d19 | 1012 | int i; |
62843f36 DW |
1013 | struct workqueue queue; |
1014 | struct workqueue *queues; | |
f994d14f | 1015 | int queues_started = 0; |
1164bde5 DC |
1016 | |
1017 | /* | |
1018 | * If the previous phases of repair have not overflowed the buffer | |
1019 | * cache, then we don't need to re-read any of the metadata in the | |
1020 | * filesystem - it's all in the cache. In that case, run a thread per | |
1021 | * CPU to maximise parallelism of the queue to be processed. | |
1022 | */ | |
1023 | if (check_cache && !libxfs_bcache_overflowed()) { | |
62843f36 | 1024 | queue.wq_ctx = mp; |
4b45ff6f | 1025 | create_work_queue(&queue, mp, platform_nproc()); |
1164bde5 DC |
1026 | for (i = 0; i < mp->m_sb.sb_agcount; i++) |
1027 | queue_work(&queue, func, i, NULL); | |
1028 | destroy_work_queue(&queue); | |
1029 | return; | |
1030 | } | |
1031 | ||
1032 | /* | |
1033 | * single threaded behaviour - single prefetch thread, processed | |
1034 | * directly after each AG is queued. | |
1035 | */ | |
1036 | if (!stride) { | |
62843f36 | 1037 | queue.wq_ctx = mp; |
71014d19 DC |
1038 | prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount, |
1039 | dirs_only, func); | |
1164bde5 DC |
1040 | return; |
1041 | } | |
1042 | ||
1043 | /* | |
1044 | * create one worker thread for each segment of the volume | |
1045 | */ | |
62843f36 | 1046 | queues = malloc(thread_count * sizeof(struct workqueue)); |
71014d19 DC |
1047 | for (i = 0; i < thread_count; i++) { |
1048 | struct pf_work_args *wargs; | |
1049 | ||
1050 | wargs = malloc(sizeof(struct pf_work_args)); | |
1051 | wargs->start_ag = i * stride; | |
1052 | wargs->end_ag = min((i + 1) * stride, | |
1053 | mp->m_sb.sb_agcount); | |
1054 | wargs->dirs_only = dirs_only; | |
1055 | wargs->func = func; | |
1056 | ||
1164bde5 | 1057 | create_work_queue(&queues[i], mp, 1); |
71014d19 | 1058 | queue_work(&queues[i], prefetch_ag_range_work, 0, wargs); |
f994d14f | 1059 | queues_started++; |
71014d19 DC |
1060 | |
1061 | if (wargs->end_ag >= mp->m_sb.sb_agcount) | |
1062 | break; | |
1164bde5 | 1063 | } |
71014d19 | 1064 | |
1164bde5 DC |
1065 | /* |
1066 | * wait for workers to complete | |
1067 | */ | |
f994d14f | 1068 | for (i = 0; i < queues_started; i++) |
1164bde5 DC |
1069 | destroy_work_queue(&queues[i]); |
1070 | free(queues); | |
1071 | } | |
1072 | ||
cb5b3ef4 | 1073 | void |
2556c98b BN |
1074 | wait_for_inode_prefetch( |
1075 | prefetch_args_t *args) | |
cb5b3ef4 | 1076 | { |
2556c98b | 1077 | if (args == NULL) |
cb5b3ef4 | 1078 | return; |
2556c98b BN |
1079 | |
1080 | pthread_mutex_lock(&args->lock); | |
1081 | ||
1082 | while (!args->can_start_processing) { | |
2556c98b | 1083 | pftrace("waiting to start processing AG %d", args->agno); |
4c0a98ae | 1084 | |
2556c98b | 1085 | pthread_cond_wait(&args->start_processing, &args->lock); |
cb5b3ef4 | 1086 | } |
2556c98b | 1087 | pftrace("can start processing AG %d", args->agno); |
4c0a98ae | 1088 | |
2556c98b BN |
1089 | pthread_mutex_unlock(&args->lock); |
1090 | } | |
cb5b3ef4 | 1091 | |
2556c98b BN |
/*
 * Tear down the prefetch state for one AG after processing completes.
 * Joins the queuing thread first so no other thread can still be using
 * the synchronization primitives when they are destroyed, then frees
 * @args. Safe to call with NULL (prefetch was never started).
 */
void
cleanup_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pftrace("waiting AG %d prefetch to finish", args->agno);

	/* queuing_thread is zeroed if pthread_create failed earlier */
	if (args->queuing_thread)
		pthread_join(args->queuing_thread, NULL);

	pftrace("AG %d prefetch done", args->agno);

	/* any chained next-AG prefetch must already have been handed off */
	ASSERT(args->next_args == NULL);

	pthread_mutex_destroy(&args->lock);
	pthread_cond_destroy(&args->start_reading);
	pthread_cond_destroy(&args->start_processing);
	sem_destroy(&args->ra_count);
	btree_destroy(args->io_queue);

	free(args);
}
1116 | ||
2556c98b BN |
1117 | #ifdef XR_PF_TRACE |
1118 | ||
4c0a98ae BN |
1119 | static FILE *pf_trace_file; |
1120 | ||
1121 | void | |
1122 | pftrace_init(void) | |
1123 | { | |
1124 | pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w"); | |
1125 | setvbuf(pf_trace_file, NULL, _IOLBF, 1024); | |
1126 | } | |
1127 | ||
1128 | void | |
1129 | pftrace_done(void) | |
1130 | { | |
1131 | fclose(pf_trace_file); | |
1132 | } | |
1133 | ||
cb5b3ef4 | 1134 | void |
2556c98b | 1135 | _pftrace(const char *func, const char *msg, ...) |
cb5b3ef4 | 1136 | { |
2556c98b BN |
1137 | char buf[200]; |
1138 | struct timeval tv; | |
1139 | va_list args; | |
cb5b3ef4 | 1140 | |
2556c98b | 1141 | gettimeofday(&tv, NULL); |
cb5b3ef4 | 1142 | |
2556c98b BN |
1143 | va_start(args, msg); |
1144 | vsnprintf(buf, sizeof(buf), msg, args); | |
1145 | buf[sizeof(buf)-1] = '\0'; | |
1146 | va_end(args); | |
cb5b3ef4 | 1147 | |
4c0a98ae BN |
1148 | fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, |
1149 | func, buf); | |
cb5b3ef4 | 1150 | } |
2556c98b BN |
1151 | |
1152 | #endif |