]>
Commit | Line | Data |
---|---|---|
959ef981 DC |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
6b803e5a | 3 | #include "libxfs.h" |
2556c98b | 4 | #include <pthread.h> |
cb5b3ef4 | 5 | #include "avl.h" |
379397bf | 6 | #include "btree.h" |
cb5b3ef4 MV |
7 | #include "globals.h" |
8 | #include "agheader.h" | |
9 | #include "incore.h" | |
cb5b3ef4 | 10 | #include "dir2.h" |
cb5b3ef4 MV |
11 | #include "protos.h" |
12 | #include "err_protos.h" | |
13 | #include "dinode.h" | |
14 | #include "bmap.h" | |
15 | #include "versions.h" | |
2556c98b BN |
16 | #include "threads.h" |
17 | #include "prefetch.h" | |
18 | #include "progress.h" | |
cb5b3ef4 MV |
19 | |
20 | int do_prefetch = 1; | |
21 | ||
2556c98b BN |
22 | /* |
23 | * Performs prefetching by priming the libxfs cache by using a dedicate thread | |
24 | * scanning inodes and reading blocks in ahead of time they are required. | |
25 | * | |
26 | * Any I/O errors can be safely ignored. | |
27 | */ | |
cb5b3ef4 | 28 | |
2556c98b BN |
29 | static xfs_mount_t *mp; |
30 | static int mp_fd; | |
31 | static int pf_max_bytes; | |
32 | static int pf_max_bbs; | |
33 | static int pf_max_fsbs; | |
34 | static int pf_batch_bytes; | |
35 | static int pf_batch_fsbs; | |
cb5b3ef4 | 36 | |
69ec88b5 BN |
37 | static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *); |
38 | ||
a040d7c9 BN |
39 | /* |
40 | * Buffer priorities for the libxfs cache | |
41 | * | |
42 | * Directory metadata is ranked higher than other metadata as it's used | |
43 | * in phases 3, 4 and 6, while other metadata is only used in 3 & 4. | |
44 | */ | |
69ec88b5 | 45 | |
a040d7c9 BN |
46 | /* intermediate directory btree nodes - can't be queued */ |
47 | #define B_DIR_BMAP CACHE_PREFETCH_PRIORITY + 7 | |
48 | /* directory metadata in secondary queue */ | |
49 | #define B_DIR_META_2 CACHE_PREFETCH_PRIORITY + 6 | |
50 | /* dir metadata that had to fetched from the primary queue to avoid stalling */ | |
51 | #define B_DIR_META_H CACHE_PREFETCH_PRIORITY + 5 | |
52 | /* single block of directory metadata (can't batch read) */ | |
53 | #define B_DIR_META_S CACHE_PREFETCH_PRIORITY + 4 | |
54 | /* dir metadata with more than one block fetched in a single I/O */ | |
55 | #define B_DIR_META CACHE_PREFETCH_PRIORITY + 3 | |
56 | /* inode clusters with directory inodes */ | |
57 | #define B_DIR_INODE CACHE_PREFETCH_PRIORITY + 2 | |
58 | /* intermediate extent btree nodes */ | |
59 | #define B_BMAP CACHE_PREFETCH_PRIORITY + 1 | |
60 | /* inode clusters without any directory entries */ | |
61 | #define B_INODE CACHE_PREFETCH_PRIORITY | |
69ec88b5 | 62 | |
a040d7c9 BN |
63 | /* |
64 | * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if | |
65 | * the buffer is for an inode or other metadata. | |
66 | */ | |
67 | #define B_IS_INODE(f) (((f) & 5) == 0) | |
cb5b3ef4 | 68 | |
2556c98b BN |
69 | #define DEF_BATCH_BYTES 0x10000 |
70 | ||
71 | #define MAX_BUFS 128 | |
72 | ||
69ec88b5 | 73 | #define IO_THRESHOLD (MAX_BUFS * 2) |
2556c98b BN |
74 | |
75 | typedef enum pf_which { | |
76 | PF_PRIMARY, | |
77 | PF_SECONDARY, | |
78 | PF_META_ONLY | |
79 | } pf_which_t; | |
80 | ||
81 | ||
82 | static inline void | |
83 | pf_start_processing( | |
84 | prefetch_args_t *args) | |
85 | { | |
86 | if (!args->can_start_processing) { | |
2556c98b | 87 | pftrace("signalling processing for AG %d", args->agno); |
4c0a98ae | 88 | |
2556c98b BN |
89 | args->can_start_processing = 1; |
90 | pthread_cond_signal(&args->start_processing); | |
cb5b3ef4 | 91 | } |
2556c98b BN |
92 | } |
93 | ||
94 | static inline void | |
95 | pf_start_io_workers( | |
96 | prefetch_args_t *args) | |
97 | { | |
98 | if (!args->can_start_reading) { | |
2556c98b | 99 | pftrace("signalling reading for AG %d", args->agno); |
4c0a98ae | 100 | |
2556c98b BN |
101 | args->can_start_reading = 1; |
102 | pthread_cond_broadcast(&args->start_reading); | |
cb5b3ef4 | 103 | } |
cb5b3ef4 MV |
104 | } |
105 | ||
2556c98b | 106 | |
/*
 * Queue a buffer mapping for background prefetch I/O.
 *
 * The buffer is grabbed with a trylock so we never block here, tagged
 * with the caller-supplied priority and inserted into the per-AG I/O
 * queue keyed by filesystem block number.  Already-cached buffers are
 * handled immediately instead of being queued.
 */
static void
pf_queue_io(
	prefetch_args_t		*args,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flag)
{
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);

	/*
	 * Never block on a buffer lock here, given that the actual repair
	 * code might lock buffers in a different order from us.  Given that
	 * the lock holder is either reading it from disk himself or
	 * completely overwriting it this behaviour is perfectly fine.
	 */
	bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
	if (!bp)
		return;

	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		/*
		 * Already cached: scan directory inodes right away and bump
		 * the cache priority so the buffer survives until phase 6.
		 */
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
			CACHE_PREFETCH_PRIORITY);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	btree_insert(args->io_queue, fsbno, bp);

	if (fsbno > args->last_bno_read) {
		/* ahead of the reader: count inode bufs and kick the workers
		 * once enough have accumulated */
		if (B_IS_INODE(flag)) {
			args->inode_bufs_queued++;
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
	} else {
		/* behind the reader: goes onto the secondary queue */
		ASSERT(!B_IS_INODE(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
	}

	pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
		"(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
		'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
		args->inode_bufs_queued, args->last_bno_read);

	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}
161 | ||
2556c98b BN |
/*
 * Walk an array of on-disk bmbt extent records, queuing prefetch I/O
 * for the directory blocks they map.  Extents are chopped into
 * directory-block-sized xfs_buf_map batches; very fragmented dir2
 * blocks grow the map array dynamically.
 *
 * Returns 1 if the record list looked sane, 0 if validation failed.
 */
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_filblks_t		cp = 0;	/* prev count */
	xfs_fileoff_t		op = 0;	/* prev offset */
#define MAP_ARRAY_SZ 4
	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
	struct xfs_buf_map	*map = map_array;
	int			max_extents = MAP_ARRAY_SZ;
	int			nmaps = 0;
	unsigned int		len = 0;
	int			ret = 0;


	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		/* reject overlapping, empty or out-of-range extents */
		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			goto out_free;

		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			goto out_free;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dir_geo->freeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		while (irec.br_blockcount) {
			unsigned int	bm_len;

			pftrace("queuing dir extent in AG %d", args->agno);

			/* clamp this mapping so a batch never exceeds one
			 * directory block's worth of fs blocks */
			if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
				bm_len = mp->m_dir_geo->fsbcount - len;
			else
				bm_len = irec.br_blockcount;
			len += bm_len;

			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
						irec.br_startblock);
			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
			nmaps++;

			/* a full directory block is assembled - queue it */
			if (len == mp->m_dir_geo->fsbcount) {
				pf_queue_io(args, map, nmaps, B_DIR_META);
				len = 0;
				nmaps = 0;
			}

			irec.br_blockcount -= bm_len;
			irec.br_startblock += bm_len;

			/*
			 * Handle very fragmented dir2 blocks with dynamically
			 * allocated buffer maps.
			 */
			if (nmaps >= max_extents) {
				struct xfs_buf_map *old_map = NULL;

				/* first growth: realloc(NULL) mallocs, then
				 * the stack contents are copied over */
				if (map == map_array) {
					old_map = map;
					map = NULL;
				}
				max_extents *= 2;
				map = realloc(map, max_extents * sizeof(*map));
				if (map == NULL) {
					do_error(
				_("couldn't malloc dir2 buffer list\n"));
					exit(1);
				}
				if (old_map)
					memcpy(map, old_map, sizeof(map_array));
			}

		}
	}
	ret = 1;
out_free:
	if (map != map_array)
		free(map);
	return ret;
}
cb5b3ef4 | 255 | |
2556c98b BN |
/*
 * simplified version of the main scan_lbtree. Returns 0 to stop.
 *
 * Reads one bmbt block synchronously, tags it with the appropriate
 * cache priority, and hands it to the scan callback.  Verifier errors
 * leave the buffer marked unchecked so the real repair read sees them.
 */

static int
pf_scan_lbtree(
	xfs_fsblock_t		dbno,
	int			level,
	int			isadir,
	prefetch_args_t		*args,
	int			(*func)(struct xfs_btree_block	*block,
					int			level,
					int			isadir,
					prefetch_args_t		*args))
{
	xfs_buf_t		*bp;
	int			rc;

	bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
			XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
	if (!bp)
		return 0;

	XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);

	/*
	 * If the verifier flagged a problem with the buffer, we can't trust
	 * its contents for the purposes of reading ahead.  Stop prefetching
	 * the tree and mark the buffer unchecked so that the next read of the
	 * buffer will retain the error status and be acted upon appropriately.
	 */
	if (bp->b_error) {
		bp->b_flags |= LIBXFS_B_UNCHECKED;
		libxfs_putbuf(bp);
		return 0;
	}

	rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);

	libxfs_putbuf(bp);

	return rc;
}
299 | ||
/*
 * Scan callback for one bmbt block: at level 0 queue prefetch for the
 * directory extents it holds, otherwise recurse into each child block.
 * Returns 0 to stop the scan, 1 to continue.
 */
static int
pf_scanfunc_bmap(
	struct xfs_btree_block	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_ptr_t		*pp;
	int			numrecs;
	int			i;
	xfs_fsblock_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
	     block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		/* leaf: only directory data is worth prefetching */
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;
		return pf_read_bmbt_reclist(args,
			XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
	}

	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

	for (i = 0; i < numrecs; i++) {
		/* child pointers may be unaligned within the block */
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}
343 | ||
2556c98b BN |
344 | |
/*
 * Prefetch the bmap btree rooted in an inode's data fork: validate the
 * root, then descend into each child via pf_scan_lbtree().
 */
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_fsblock_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	/* sanity-check root: a btree fork must have depth and records */
	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}
384 | ||
385 | static void | |
386 | pf_read_exinode( | |
387 | prefetch_args_t *args, | |
388 | xfs_dinode_t *dino) | |
389 | { | |
390 | pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino), | |
56b2de80 | 391 | be32_to_cpu(dino->di_nextents)); |
2556c98b | 392 | } |
cb5b3ef4 | 393 | |
2556c98b BN |
/*
 * Scan a prefetched inode-cluster buffer and queue readahead for the
 * directory data of every plausibly valid directory inode in it.
 * If any directory inodes were found, bump the buffer's cache priority
 * so it survives until phase 6.
 */
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;

	libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
	if (bp->b_error)
		return;

	for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) {
		dino = xfs_make_iptr(mp, bp, icnt);

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		/* local-format forks have no external blocks to read */
		if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dino->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!libxfs_dinode_good_version(mp, dino->di_version))
			continue;

		/* fits in the literal area - nothing external to fetch */
		if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dino->di_forkoff != 0) &&
		    (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
			continue;

		switch (dino->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}
458 | ||
dd9093de DC |
/*
 * pf_batch_read must be called with the lock locked.
 *
 * Gathers a run of queued buffers from the AG's I/O queue, reads them
 * from disk in as few large preads as practical, copies the data into
 * the individual xfs_buf_t's and releases them.  The lock is dropped
 * around the actual I/O and retaken before returning to the caller's
 * loop.  Which buffers are eligible depends on `which`:
 *   PF_PRIMARY   - ahead of last_bno_read, inodes and metadata
 *   PF_SECONDARY - behind last_bno_read
 *   PF_META_ONLY - ahead of last_bno_read, metadata only
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno = 0;
	unsigned long		max_fsbno;
	char			*pbuf;

	for (;;) {
		num = 0;
		if (which == PF_SECONDARY) {
			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
			max_fsbno = min(fsbno + pf_max_fsbs,
							args->last_bno_read);
		} else {
			bplist[0] = btree_find(args->io_queue,
						args->last_bno_read, &fsbno);
			max_fsbno = fsbno + pf_max_fsbs;
		}
		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
			/*
			 * Discontiguous buffers need special handling, so stop
			 * gathering new buffers and process the list and this
			 * discontigous buffer immediately. This avoids the
			 * complexity of keeping a separate discontigous buffer
			 * list and seeking back over ranges we've already done
			 * optimised reads for.
			 */
			if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
				num++;
				break;
			}

			if (which != PF_META_ONLY ||
			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
				num++;
			if (num == MAX_BUFS)
				break;
			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
		}
		if (!num)
			return;

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks and
		 * read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		while (num > 1 && last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		/* remove the selected buffers from the queue before we
		 * drop the lock */
		for (i = 0; i < num; i++) {
			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
		}
#ifdef XR_PF_TRACE
		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);
#endif
		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_but_t's
		 */
		len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);

		/*
		 * Check the last buffer on the list to see if we need to
		 * process a discontiguous buffer. The gather above loop
		 * guarantees that only the last buffer in the list will be a
		 * discontiguous buffer.
		 */
		if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
			libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
			bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
			libxfs_putbuf(bplist[num - 1]);
			num--;
		}

		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			for (i = 0; i < num; i++) {

				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;
				memcpy(bplist[i]->b_addr, pbuf, size);
				bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
						       LIBXFS_B_UNCHECKED);
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_S);
			}
		}
		for (i = 0; i < num; i++) {
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);
			/*
			 * if primary inode queue running low, process metadata
			 * in boths queues to avoid I/O starvation as the
			 * processing thread would be waiting for a metadata
			 * buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_META_ONLY, buf);

				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}
642 | ||
/*
 * Prefetch I/O worker thread body.  Waits until the queuing thread
 * enables reading, then drains the primary and secondary queues until
 * queuing is finished and the queue is empty.  The scratch read buffer
 * is aligned for direct I/O.
 */
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	void			*buf = memalign(libxfs_device_alignment(),
						pf_max_bytes);

	/* no buffer, no prefetch - processing still works without us */
	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);

		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);

		pftrace("starting prefetch I/O for AG %d", args->agno);

		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

		pftrace("ran out of bufs to prefetch for AG %d", args->agno);

		/* re-arm the wait unless the queuing side is shutting down */
		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

	pftrace("finished prefetch I/O for AG %d", args->agno);

	return NULL;
}
679 | ||
2556c98b BN |
680 | static int |
681 | pf_create_prefetch_thread( | |
682 | prefetch_args_t *args); | |
683 | ||
b97ad969 JM |
/*
 * If we fail to create the queuing thread or can't create even one
 * prefetch thread, we need to let processing continue without it.
 */
static void
pf_skip_prefetch_thread(prefetch_args_t *args)
{
	prefetch_args_t *next;

	pthread_mutex_lock(&args->lock);
	/* mark this AG done and unblock anyone waiting to process it */
	args->prefetch_done = 1;
	pf_start_processing(args);
	/* detach the chained AG under the lock, start it after dropping it */
	next = args->next_args;
	args->next_args = NULL;
	pthread_mutex_unlock(&args->lock);

	if (next)
		pf_create_prefetch_thread(next);
}
703 | ||
2556c98b BN |
/*
 * Queuing thread body for one AG: spawns the I/O worker pool, walks the
 * AG's inode record tree queuing cluster reads (throttled by the
 * ra_count semaphore, skipping sparse clusters), then performs the
 * shutdown handshake and chains to the next AG's prefetch, if any.
 */
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	prefetch_args_t		*next_args;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;
	uint64_t		sparse;

	blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			pftrace("failed to create prefetch thread for AG %d: %s",
				args->agno, strerror(err));
			args->io_threads[i] = 0;
			if (i == 0) {
				pf_skip_prefetch_thread(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}
	pftrace("starting prefetch for AG %d", args->agno);

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		/* advance over all chunks belonging to this allocation */
		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < mp->m_ialloc_inos && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		err = sem_trywait(&args->ra_count);
		if (err < 0 && errno == EAGAIN) {
			/*
			 * Kick the queue once we have reached the limit;
			 * without this the threads processing the inodes
			 * might get stuck on a buffer that has been locked
			 * and added to the I/O queue but is waiting for
			 * the thread to be woken.
			 */
			pf_start_io_workers(args);
			sem_wait(&args->ra_count);
		}

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
		sparse = cur_irec->ir_sparse;

		do {
			struct xfs_buf_map	map;

			map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
			map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);

			/*
			 * Queue I/O for each non-sparse cluster. We can check
			 * sparse state in cluster sized chunks as cluster size
			 * is the min. granularity of sparse irec regions.
			 */
			if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
				pf_queue_io(args, &map, 1,
						(cur_irec->ino_isa_dir != 0) ?
							B_DIR_INODE : B_INODE);

			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
			sparse >>= inodes_per_cluster;
		} while (num_inos < mp->m_ialloc_inos);
	}

	pthread_mutex_lock(&args->lock);

	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);

	/* let the workers drain whatever remains queued */
	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

	pftrace("prefetch for AG %d finished", args->agno);

	pthread_mutex_lock(&args->lock);

	ASSERT(btree_is_empty(args->io_queue));

	args->prefetch_done = 1;
	/* hand off the chained AG outside the lock */
	next_args = args->next_args;
	args->next_args = NULL;
	pthread_mutex_unlock(&args->lock);

	if (next_args)
		pf_create_prefetch_thread(next_args);

	return NULL;
}
833 | ||
2556c98b BN |
/*
 * Start the queuing thread for an AG.  On failure, processing is allowed
 * to continue without prefetch via pf_skip_prefetch_thread().
 * Returns 1 on success, 0 on failure.
 */
static int
pf_create_prefetch_thread(
	prefetch_args_t		*args)
{
	int			err;

	pftrace("creating queue thread for AG %d", args->agno);

	err = pthread_create(&args->queuing_thread, NULL,
			pf_queuing_worker, args);
	if (err != 0) {
		do_warn(_("failed to create prefetch thread: %s\n"),
			strerror(err));
		pftrace("failed to create prefetch thread for AG %d: %s",
			args->agno, strerror(err));
		args->queuing_thread = 0;
		pf_skip_prefetch_thread(args);
	}

	return err == 0;
}
cb5b3ef4 MV |
855 | |
856 | void | |
2556c98b BN |
857 | init_prefetch( |
858 | xfs_mount_t *pmp) | |
cb5b3ef4 | 859 | { |
2556c98b | 860 | mp = pmp; |
75c8b434 | 861 | mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev); |
2556c98b BN |
862 | pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7; |
863 | pf_max_bbs = pf_max_bytes >> BBSHIFT; | |
864 | pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog; | |
865 | pf_batch_bytes = DEF_BATCH_BYTES; | |
866 | pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1); | |
867 | } | |
cb5b3ef4 | 868 | |
2556c98b BN |
869 | prefetch_args_t * |
870 | start_inode_prefetch( | |
871 | xfs_agnumber_t agno, | |
872 | int dirs_only, | |
873 | prefetch_args_t *prev_args) | |
874 | { | |
875 | prefetch_args_t *args; | |
edf3f9d0 | 876 | long max_queue; |
cb5b3ef4 | 877 | |
2556c98b BN |
878 | if (!do_prefetch || agno >= mp->m_sb.sb_agcount) |
879 | return NULL; | |
cb5b3ef4 | 880 | |
2556c98b BN |
881 | args = calloc(1, sizeof(prefetch_args_t)); |
882 | ||
bb34c934 | 883 | btree_init(&args->io_queue); |
5e656dbb BN |
884 | if (pthread_mutex_init(&args->lock, NULL) != 0) |
885 | do_error(_("failed to initialize prefetch mutex\n")); | |
886 | if (pthread_cond_init(&args->start_reading, NULL) != 0) | |
887 | do_error(_("failed to initialize prefetch cond var\n")); | |
888 | if (pthread_cond_init(&args->start_processing, NULL) != 0) | |
889 | do_error(_("failed to initialize prefetch cond var\n")); | |
2556c98b BN |
890 | args->agno = agno; |
891 | args->dirs_only = dirs_only; | |
892 | ||
893 | /* | |
894 | * use only 1/8 of the libxfs cache as we are only counting inodes | |
895 | * and not any other associated metadata like directories | |
896 | */ | |
897 | ||
edf3f9d0 | 898 | max_queue = libxfs_bcache->c_maxcount / thread_count / 8; |
ff105f75 DC |
899 | if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize) |
900 | max_queue = max_queue * | |
901 | (mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) / | |
902 | mp->m_ialloc_blks; | |
edf3f9d0 BN |
903 | |
904 | sem_init(&args->ra_count, 0, max_queue); | |
2556c98b BN |
905 | |
906 | if (!prev_args) { | |
907 | if (!pf_create_prefetch_thread(args)) | |
908 | return NULL; | |
909 | } else { | |
910 | pthread_mutex_lock(&prev_args->lock); | |
911 | if (prev_args->prefetch_done) { | |
b97ad969 | 912 | pthread_mutex_unlock(&prev_args->lock); |
2556c98b BN |
913 | if (!pf_create_prefetch_thread(args)) |
914 | args = NULL; | |
e8ff6275 | 915 | } else { |
2556c98b | 916 | prev_args->next_args = args; |
e8ff6275 JM |
917 | pftrace("queued AG %d after AG %d", |
918 | args->agno, prev_args->agno); | |
b97ad969 | 919 | pthread_mutex_unlock(&prev_args->lock); |
e8ff6275 | 920 | } |
cb5b3ef4 | 921 | } |
2556c98b BN |
922 | |
923 | return args; | |
cb5b3ef4 MV |
924 | } |
925 | ||
71014d19 DC |
926 | /* |
927 | * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It | |
928 | * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch | |
929 | * or process @end_ag). The function starts prefetch on the first AG, then loops | |
930 | * starting prefetch on the next AG and then blocks processing the current AG as | |
931 | * the prefetch queue brings inodes into the processing queue. | |
932 | * | |
933 | * There is only one prefetch taking place at a time, so the prefetch on the | |
934 | * next AG only starts once the current AG has been completely prefetched. Hence | |
935 | * the prefetch of the next AG will start some time before the processing of the | |
936 | * current AG finishes, ensuring that when we iterate an start processing the | |
937 | * next AG there is already a significant queue of inodes to process. | |
938 | * | |
939 | * Prefetch is done this way to prevent it from running too far ahead of the | |
940 | * processing. Allowing it to do so can cause cache thrashing, where new | |
941 | * prefetch causes previously prefetched buffers to be reclaimed before the | |
942 | * processing thread uses them. This results in reading all the inodes and | |
943 | * metadata twice per phase and it greatly slows down the processing. Hence we | |
944 | * have to carefully control how far ahead we prefetch... | |
945 | */ | |
946 | static void | |
947 | prefetch_ag_range( | |
62843f36 | 948 | struct workqueue *work, |
71014d19 DC |
949 | xfs_agnumber_t start_ag, |
950 | xfs_agnumber_t end_ag, | |
951 | bool dirs_only, | |
62843f36 | 952 | void (*func)(struct workqueue *, |
71014d19 DC |
953 | xfs_agnumber_t, void *)) |
954 | { | |
955 | int i; | |
956 | struct prefetch_args *pf_args[2]; | |
957 | ||
958 | pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL); | |
959 | for (i = start_ag; i < end_ag; i++) { | |
960 | /* Don't prefetch end_ag */ | |
961 | if (i + 1 < end_ag) | |
962 | pf_args[(~i) & 1] = start_inode_prefetch(i + 1, | |
963 | dirs_only, pf_args[i & 1]); | |
964 | func(work, i, pf_args[i & 1]); | |
965 | } | |
966 | } | |
967 | ||
/* Arguments handed to prefetch_ag_range_work() through the workqueue. */
struct pf_work_args {
	xfs_agnumber_t	start_ag;	/* first AG in this worker's segment */
	xfs_agnumber_t	end_ag;		/* first AG NOT processed (exclusive) */
	bool		dirs_only;	/* prefetch directory metadata only */
	void		(*func)(struct workqueue *, xfs_agnumber_t, void *);
					/* per-AG processing callback */
};
974 | ||
975 | static void | |
976 | prefetch_ag_range_work( | |
62843f36 | 977 | struct workqueue *work, |
71014d19 DC |
978 | xfs_agnumber_t unused, |
979 | void *args) | |
980 | { | |
981 | struct pf_work_args *wargs = args; | |
982 | ||
f8149110 | 983 | prefetch_ag_range(work, wargs->start_ag, wargs->end_ag, |
71014d19 DC |
984 | wargs->dirs_only, wargs->func); |
985 | free(args); | |
986 | } | |
987 | ||
1164bde5 DC |
988 | /* |
989 | * Do inode prefetch in the most optimal way for the context under which repair | |
990 | * has been run. | |
991 | */ | |
992 | void | |
993 | do_inode_prefetch( | |
994 | struct xfs_mount *mp, | |
995 | int stride, | |
62843f36 | 996 | void (*func)(struct workqueue *, |
1164bde5 DC |
997 | xfs_agnumber_t, void *), |
998 | bool check_cache, | |
999 | bool dirs_only) | |
1000 | { | |
71014d19 | 1001 | int i; |
62843f36 DW |
1002 | struct workqueue queue; |
1003 | struct workqueue *queues; | |
f994d14f | 1004 | int queues_started = 0; |
1164bde5 DC |
1005 | |
1006 | /* | |
1007 | * If the previous phases of repair have not overflowed the buffer | |
1008 | * cache, then we don't need to re-read any of the metadata in the | |
1009 | * filesystem - it's all in the cache. In that case, run a thread per | |
1010 | * CPU to maximise parallelism of the queue to be processed. | |
1011 | */ | |
1012 | if (check_cache && !libxfs_bcache_overflowed()) { | |
62843f36 | 1013 | queue.wq_ctx = mp; |
1164bde5 DC |
1014 | create_work_queue(&queue, mp, libxfs_nproc()); |
1015 | for (i = 0; i < mp->m_sb.sb_agcount; i++) | |
1016 | queue_work(&queue, func, i, NULL); | |
1017 | destroy_work_queue(&queue); | |
1018 | return; | |
1019 | } | |
1020 | ||
1021 | /* | |
1022 | * single threaded behaviour - single prefetch thread, processed | |
1023 | * directly after each AG is queued. | |
1024 | */ | |
1025 | if (!stride) { | |
62843f36 | 1026 | queue.wq_ctx = mp; |
71014d19 DC |
1027 | prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount, |
1028 | dirs_only, func); | |
1164bde5 DC |
1029 | return; |
1030 | } | |
1031 | ||
1032 | /* | |
1033 | * create one worker thread for each segment of the volume | |
1034 | */ | |
62843f36 | 1035 | queues = malloc(thread_count * sizeof(struct workqueue)); |
71014d19 DC |
1036 | for (i = 0; i < thread_count; i++) { |
1037 | struct pf_work_args *wargs; | |
1038 | ||
1039 | wargs = malloc(sizeof(struct pf_work_args)); | |
1040 | wargs->start_ag = i * stride; | |
1041 | wargs->end_ag = min((i + 1) * stride, | |
1042 | mp->m_sb.sb_agcount); | |
1043 | wargs->dirs_only = dirs_only; | |
1044 | wargs->func = func; | |
1045 | ||
1164bde5 | 1046 | create_work_queue(&queues[i], mp, 1); |
71014d19 | 1047 | queue_work(&queues[i], prefetch_ag_range_work, 0, wargs); |
f994d14f | 1048 | queues_started++; |
71014d19 DC |
1049 | |
1050 | if (wargs->end_ag >= mp->m_sb.sb_agcount) | |
1051 | break; | |
1164bde5 | 1052 | } |
71014d19 | 1053 | |
1164bde5 DC |
1054 | /* |
1055 | * wait for workers to complete | |
1056 | */ | |
f994d14f | 1057 | for (i = 0; i < queues_started; i++) |
1164bde5 DC |
1058 | destroy_work_queue(&queues[i]); |
1059 | free(queues); | |
1060 | } | |
1061 | ||
cb5b3ef4 | 1062 | void |
2556c98b BN |
1063 | wait_for_inode_prefetch( |
1064 | prefetch_args_t *args) | |
cb5b3ef4 | 1065 | { |
2556c98b | 1066 | if (args == NULL) |
cb5b3ef4 | 1067 | return; |
2556c98b BN |
1068 | |
1069 | pthread_mutex_lock(&args->lock); | |
1070 | ||
1071 | while (!args->can_start_processing) { | |
2556c98b | 1072 | pftrace("waiting to start processing AG %d", args->agno); |
4c0a98ae | 1073 | |
2556c98b | 1074 | pthread_cond_wait(&args->start_processing, &args->lock); |
cb5b3ef4 | 1075 | } |
2556c98b | 1076 | pftrace("can start processing AG %d", args->agno); |
4c0a98ae | 1077 | |
2556c98b BN |
1078 | pthread_mutex_unlock(&args->lock); |
1079 | } | |
cb5b3ef4 | 1080 | |
2556c98b BN |
1081 | void |
1082 | cleanup_inode_prefetch( | |
1083 | prefetch_args_t *args) | |
1084 | { | |
1085 | if (args == NULL) | |
1086 | return; | |
cb5b3ef4 | 1087 | |
2556c98b | 1088 | pftrace("waiting AG %d prefetch to finish", args->agno); |
4c0a98ae | 1089 | |
2556c98b BN |
1090 | if (args->queuing_thread) |
1091 | pthread_join(args->queuing_thread, NULL); | |
1092 | ||
2556c98b | 1093 | pftrace("AG %d prefetch done", args->agno); |
4c0a98ae | 1094 | |
b97ad969 JM |
1095 | ASSERT(args->next_args == NULL); |
1096 | ||
2556c98b BN |
1097 | pthread_mutex_destroy(&args->lock); |
1098 | pthread_cond_destroy(&args->start_reading); | |
1099 | pthread_cond_destroy(&args->start_processing); | |
1100 | sem_destroy(&args->ra_count); | |
bb34c934 | 1101 | btree_destroy(args->io_queue); |
2556c98b BN |
1102 | |
1103 | free(args); | |
cb5b3ef4 MV |
1104 | } |
1105 | ||
2556c98b BN |
1106 | #ifdef XR_PF_TRACE |
1107 | ||
4c0a98ae BN |
1108 | static FILE *pf_trace_file; |
1109 | ||
1110 | void | |
1111 | pftrace_init(void) | |
1112 | { | |
1113 | pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w"); | |
1114 | setvbuf(pf_trace_file, NULL, _IOLBF, 1024); | |
1115 | } | |
1116 | ||
1117 | void | |
1118 | pftrace_done(void) | |
1119 | { | |
1120 | fclose(pf_trace_file); | |
1121 | } | |
1122 | ||
cb5b3ef4 | 1123 | void |
2556c98b | 1124 | _pftrace(const char *func, const char *msg, ...) |
cb5b3ef4 | 1125 | { |
2556c98b BN |
1126 | char buf[200]; |
1127 | struct timeval tv; | |
1128 | va_list args; | |
cb5b3ef4 | 1129 | |
2556c98b | 1130 | gettimeofday(&tv, NULL); |
cb5b3ef4 | 1131 | |
2556c98b BN |
1132 | va_start(args, msg); |
1133 | vsnprintf(buf, sizeof(buf), msg, args); | |
1134 | buf[sizeof(buf)-1] = '\0'; | |
1135 | va_end(args); | |
cb5b3ef4 | 1136 | |
4c0a98ae BN |
1137 | fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, |
1138 | func, buf); | |
cb5b3ef4 | 1139 | } |
2556c98b BN |
1140 | |
1141 | #endif |