]>
Commit | Line | Data |
---|---|---|
959ef981 DC |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | ||
6b803e5a | 3 | #include "libxfs.h" |
2556c98b | 4 | #include <pthread.h> |
cb5b3ef4 | 5 | #include "avl.h" |
379397bf | 6 | #include "btree.h" |
cb5b3ef4 MV |
7 | #include "globals.h" |
8 | #include "agheader.h" | |
9 | #include "incore.h" | |
cb5b3ef4 | 10 | #include "dir2.h" |
cb5b3ef4 MV |
11 | #include "protos.h" |
12 | #include "err_protos.h" | |
13 | #include "dinode.h" | |
14 | #include "bmap.h" | |
15 | #include "versions.h" | |
2556c98b BN |
16 | #include "threads.h" |
17 | #include "prefetch.h" | |
18 | #include "progress.h" | |
cb5b3ef4 MV |
19 | |
20 | int do_prefetch = 1; | |
21 | ||
2556c98b BN |
22 | /* |
23 | * Performs prefetching by priming the libxfs cache by using a dedicate thread | |
24 | * scanning inodes and reading blocks in ahead of time they are required. | |
25 | * | |
26 | * Any I/O errors can be safely ignored. | |
27 | */ | |
cb5b3ef4 | 28 | |
2556c98b BN |
29 | static xfs_mount_t *mp; |
30 | static int mp_fd; | |
31 | static int pf_max_bytes; | |
32 | static int pf_max_bbs; | |
33 | static int pf_max_fsbs; | |
34 | static int pf_batch_bytes; | |
35 | static int pf_batch_fsbs; | |
cb5b3ef4 | 36 | |
167137fe | 37 | static void pf_read_inode_dirs(prefetch_args_t *, struct xfs_buf *); |
69ec88b5 | 38 | |
a040d7c9 BN |
39 | /* |
40 | * Buffer priorities for the libxfs cache | |
41 | * | |
42 | * Directory metadata is ranked higher than other metadata as it's used | |
43 | * in phases 3, 4 and 6, while other metadata is only used in 3 & 4. | |
44 | */ | |
69ec88b5 | 45 | |
a040d7c9 BN |
46 | /* intermediate directory btree nodes - can't be queued */ |
47 | #define B_DIR_BMAP CACHE_PREFETCH_PRIORITY + 7 | |
48 | /* directory metadata in secondary queue */ | |
49 | #define B_DIR_META_2 CACHE_PREFETCH_PRIORITY + 6 | |
50 | /* dir metadata that had to fetched from the primary queue to avoid stalling */ | |
51 | #define B_DIR_META_H CACHE_PREFETCH_PRIORITY + 5 | |
52 | /* single block of directory metadata (can't batch read) */ | |
53 | #define B_DIR_META_S CACHE_PREFETCH_PRIORITY + 4 | |
54 | /* dir metadata with more than one block fetched in a single I/O */ | |
55 | #define B_DIR_META CACHE_PREFETCH_PRIORITY + 3 | |
56 | /* inode clusters with directory inodes */ | |
57 | #define B_DIR_INODE CACHE_PREFETCH_PRIORITY + 2 | |
58 | /* intermediate extent btree nodes */ | |
59 | #define B_BMAP CACHE_PREFETCH_PRIORITY + 1 | |
60 | /* inode clusters without any directory entries */ | |
61 | #define B_INODE CACHE_PREFETCH_PRIORITY | |
69ec88b5 | 62 | |
a040d7c9 BN |
63 | /* |
64 | * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if | |
65 | * the buffer is for an inode or other metadata. | |
66 | */ | |
67 | #define B_IS_INODE(f) (((f) & 5) == 0) | |
cb5b3ef4 | 68 | |
2556c98b BN |
69 | #define DEF_BATCH_BYTES 0x10000 |
70 | ||
71 | #define MAX_BUFS 128 | |
72 | ||
69ec88b5 | 73 | #define IO_THRESHOLD (MAX_BUFS * 2) |
2556c98b BN |
74 | |
75 | typedef enum pf_which { | |
76 | PF_PRIMARY, | |
77 | PF_SECONDARY, | |
78 | PF_META_ONLY | |
79 | } pf_which_t; | |
80 | ||
81 | ||
82 | static inline void | |
83 | pf_start_processing( | |
84 | prefetch_args_t *args) | |
85 | { | |
86 | if (!args->can_start_processing) { | |
2556c98b | 87 | pftrace("signalling processing for AG %d", args->agno); |
4c0a98ae | 88 | |
2556c98b BN |
89 | args->can_start_processing = 1; |
90 | pthread_cond_signal(&args->start_processing); | |
cb5b3ef4 | 91 | } |
2556c98b BN |
92 | } |
93 | ||
94 | static inline void | |
95 | pf_start_io_workers( | |
96 | prefetch_args_t *args) | |
97 | { | |
98 | if (!args->can_start_reading) { | |
2556c98b | 99 | pftrace("signalling reading for AG %d", args->agno); |
4c0a98ae | 100 | |
2556c98b BN |
101 | args->can_start_reading = 1; |
102 | pthread_cond_broadcast(&args->start_reading); | |
cb5b3ef4 | 103 | } |
cb5b3ef4 MV |
104 | } |
105 | ||
2556c98b | 106 | |
cb5b3ef4 | 107 | static void |
2556c98b BN |
108 | pf_queue_io( |
109 | prefetch_args_t *args, | |
dd9093de DC |
110 | struct xfs_buf_map *map, |
111 | int nmaps, | |
2556c98b | 112 | int flag) |
cb5b3ef4 | 113 | { |
dd9093de DC |
114 | struct xfs_buf *bp; |
115 | xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn); | |
583ca112 | 116 | int error; |
cb5b3ef4 | 117 | |
2ae22647 CH |
118 | /* |
119 | * Never block on a buffer lock here, given that the actual repair | |
120 | * code might lock buffers in a different order from us. Given that | |
121 | * the lock holder is either reading it from disk himself or | |
122 | * completely overwriting it this behaviour is perfectly fine. | |
123 | */ | |
583ca112 DW |
124 | error = -libxfs_buf_get_map(mp->m_dev, map, nmaps, |
125 | LIBXFS_GETBUF_TRYLOCK, &bp); | |
126 | if (error) | |
2ae22647 CH |
127 | return; |
128 | ||
2556c98b | 129 | if (bp->b_flags & LIBXFS_B_UPTODATE) { |
69ec88b5 BN |
130 | if (B_IS_INODE(flag)) |
131 | pf_read_inode_dirs(args, bp); | |
af60a998 | 132 | libxfs_buf_set_priority(bp, libxfs_buf_priority(bp) + |
b3563c19 | 133 | CACHE_PREFETCH_PRIORITY); |
e02ba985 | 134 | libxfs_buf_relse(bp); |
cb5b3ef4 MV |
135 | return; |
136 | } | |
af60a998 | 137 | libxfs_buf_set_priority(bp, flag); |
cb5b3ef4 | 138 | |
2556c98b | 139 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 140 | |
bb34c934 BN |
141 | btree_insert(args->io_queue, fsbno, bp); |
142 | ||
2556c98b | 143 | if (fsbno > args->last_bno_read) { |
379397bf | 144 | if (B_IS_INODE(flag)) { |
2556c98b BN |
145 | args->inode_bufs_queued++; |
146 | if (args->inode_bufs_queued == IO_THRESHOLD) | |
147 | pf_start_io_workers(args); | |
cb5b3ef4 | 148 | } |
2556c98b | 149 | } else { |
08cee623 | 150 | ASSERT(!B_IS_INODE(flag)); |
af60a998 | 151 | libxfs_buf_set_priority(bp, B_DIR_META_2); |
cb5b3ef4 MV |
152 | } |
153 | ||
4c0a98ae BN |
154 | pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue" |
155 | "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ? | |
d4aaa66b | 156 | 'I' : 'M', bp, (long long)xfs_buf_daddr(bp), args->agno, fsbno, |
4c0a98ae BN |
157 | args->inode_bufs_queued, args->last_bno_read); |
158 | ||
2556c98b | 159 | pf_start_processing(args); |
cb5b3ef4 | 160 | |
2556c98b | 161 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 MV |
162 | } |
163 | ||
2556c98b BN |
164 | static int |
165 | pf_read_bmbt_reclist( | |
166 | prefetch_args_t *args, | |
167 | xfs_bmbt_rec_t *rp, | |
168 | int numrecs) | |
cb5b3ef4 | 169 | { |
cb5b3ef4 | 170 | int i; |
e0a12bda | 171 | xfs_bmbt_irec_t irec; |
5a35bf2c DC |
172 | xfs_filblks_t cp = 0; /* prev count */ |
173 | xfs_fileoff_t op = 0; /* prev offset */ | |
dd9093de DC |
174 | #define MAP_ARRAY_SZ 4 |
175 | struct xfs_buf_map map_array[MAP_ARRAY_SZ]; | |
176 | struct xfs_buf_map *map = map_array; | |
177 | int max_extents = MAP_ARRAY_SZ; | |
24e04791 | 178 | int nmaps = 0; |
dd9093de DC |
179 | unsigned int len = 0; |
180 | int ret = 0; | |
181 | ||
2556c98b | 182 | |
5e656dbb BN |
183 | for (i = 0; i < numrecs; i++) { |
184 | libxfs_bmbt_disk_get_all(rp + i, &irec); | |
2556c98b | 185 | |
e0a12bda BN |
186 | if (((i > 0) && (op + cp > irec.br_startoff)) || |
187 | (irec.br_blockcount == 0) || | |
188 | (irec.br_startoff >= fs_max_file_offset)) | |
dd9093de | 189 | goto out_free; |
2556c98b | 190 | |
a6bd55d3 DW |
191 | if (!libxfs_verify_fsbno(mp, irec.br_startblock) || |
192 | !libxfs_verify_fsbno(mp, irec.br_startblock + | |
193 | irec.br_blockcount - 1)) | |
dd9093de | 194 | goto out_free; |
2556c98b | 195 | |
e0a12bda | 196 | if (!args->dirs_only && ((irec.br_startoff + |
ff105f75 | 197 | irec.br_blockcount) >= mp->m_dir_geo->freeblk)) |
2556c98b BN |
198 | break; /* only Phase 6 reads the free blocks */ |
199 | ||
e0a12bda BN |
200 | op = irec.br_startoff; |
201 | cp = irec.br_blockcount; | |
2556c98b | 202 | |
e0a12bda | 203 | while (irec.br_blockcount) { |
dd9093de | 204 | unsigned int bm_len; |
4c0a98ae | 205 | |
2556c98b | 206 | pftrace("queuing dir extent in AG %d", args->agno); |
4c0a98ae | 207 | |
ff105f75 DC |
208 | if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount) |
209 | bm_len = mp->m_dir_geo->fsbcount - len; | |
dd9093de DC |
210 | else |
211 | bm_len = irec.br_blockcount; | |
212 | len += bm_len; | |
213 | ||
214 | map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp, | |
215 | irec.br_startblock); | |
216 | map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len); | |
217 | nmaps++; | |
218 | ||
ff105f75 | 219 | if (len == mp->m_dir_geo->fsbcount) { |
dd9093de DC |
220 | pf_queue_io(args, map, nmaps, B_DIR_META); |
221 | len = 0; | |
222 | nmaps = 0; | |
223 | } | |
224 | ||
225 | irec.br_blockcount -= bm_len; | |
226 | irec.br_startblock += bm_len; | |
227 | ||
228 | /* | |
229 | * Handle very fragmented dir2 blocks with dynamically | |
230 | * allocated buffer maps. | |
231 | */ | |
232 | if (nmaps >= max_extents) { | |
233 | struct xfs_buf_map *old_map = NULL; | |
234 | ||
235 | if (map == map_array) { | |
236 | old_map = map; | |
237 | map = NULL; | |
238 | } | |
239 | max_extents *= 2; | |
240 | map = realloc(map, max_extents * sizeof(*map)); | |
241 | if (map == NULL) { | |
242 | do_error( | |
243 | _("couldn't malloc dir2 buffer list\n")); | |
244 | exit(1); | |
245 | } | |
246 | if (old_map) | |
247 | memcpy(map, old_map, sizeof(map_array)); | |
248 | } | |
249 | ||
2556c98b BN |
250 | } |
251 | } | |
dd9093de DC |
252 | ret = 1; |
253 | out_free: | |
254 | if (map != map_array) | |
255 | free(map); | |
256 | return ret; | |
2556c98b | 257 | } |
cb5b3ef4 | 258 | |
2556c98b BN |
259 | /* |
260 | * simplified version of the main scan_lbtree. Returns 0 to stop. | |
261 | */ | |
262 | ||
263 | static int | |
264 | pf_scan_lbtree( | |
5a35bf2c | 265 | xfs_fsblock_t dbno, |
2556c98b BN |
266 | int level, |
267 | int isadir, | |
268 | prefetch_args_t *args, | |
b3563c19 | 269 | int (*func)(struct xfs_btree_block *block, |
2556c98b BN |
270 | int level, |
271 | int isadir, | |
272 | prefetch_args_t *args)) | |
273 | { | |
167137fe | 274 | struct xfs_buf *bp; |
2556c98b | 275 | int rc; |
31079e67 | 276 | int error; |
cb5b3ef4 | 277 | |
31079e67 DW |
278 | error = -libxfs_buf_read(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno), |
279 | XFS_FSB_TO_BB(mp, 1), LIBXFS_READBUF_SALVAGE, &bp, | |
280 | &xfs_bmbt_buf_ops); | |
281 | if (error) | |
2556c98b | 282 | return 0; |
cb5b3ef4 | 283 | |
af60a998 | 284 | libxfs_buf_set_priority(bp, isadir ? B_DIR_BMAP : B_BMAP); |
69ec88b5 | 285 | |
43ba1861 DW |
286 | /* |
287 | * If the verifier flagged a problem with the buffer, we can't trust | |
288 | * its contents for the purposes of reading ahead. Stop prefetching | |
289 | * the tree and mark the buffer unchecked so that the next read of the | |
290 | * buffer will retain the error status and be acted upon appropriately. | |
291 | */ | |
292 | if (bp->b_error) { | |
293 | bp->b_flags |= LIBXFS_B_UNCHECKED; | |
e02ba985 | 294 | libxfs_buf_relse(bp); |
43ba1861 DW |
295 | return 0; |
296 | } | |
297 | ||
b3563c19 | 298 | rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args); |
cb5b3ef4 | 299 | |
e02ba985 | 300 | libxfs_buf_relse(bp); |
cb5b3ef4 | 301 | |
2556c98b BN |
302 | return rc; |
303 | } | |
304 | ||
305 | static int | |
306 | pf_scanfunc_bmap( | |
b3563c19 | 307 | struct xfs_btree_block *block, |
2556c98b BN |
308 | int level, |
309 | int isadir, | |
310 | prefetch_args_t *args) | |
311 | { | |
2556c98b BN |
312 | xfs_bmbt_ptr_t *pp; |
313 | int numrecs; | |
314 | int i; | |
5a35bf2c | 315 | xfs_fsblock_t dbno; |
2556c98b BN |
316 | |
317 | /* | |
318 | * do some validation on the block contents | |
319 | */ | |
1c88e98c DC |
320 | if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) && |
321 | block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) || | |
2556c98b BN |
322 | (be16_to_cpu(block->bb_level) != level)) |
323 | return 0; | |
324 | ||
325 | numrecs = be16_to_cpu(block->bb_numrecs); | |
326 | ||
327 | if (level == 0) { | |
328 | if (numrecs > mp->m_bmap_dmxr[0] || !isadir) | |
329 | return 0; | |
5e656dbb | 330 | return pf_read_bmbt_reclist(args, |
b3563c19 | 331 | XFS_BMBT_REC_ADDR(mp, block, 1), numrecs); |
cb5b3ef4 MV |
332 | } |
333 | ||
2556c98b BN |
334 | if (numrecs > mp->m_bmap_dmxr[1]) |
335 | return 0; | |
cb5b3ef4 | 336 | |
b3563c19 | 337 | pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); |
2556c98b BN |
338 | |
339 | for (i = 0; i < numrecs; i++) { | |
fb36a55d | 340 | dbno = get_unaligned_be64(&pp[i]); |
a6bd55d3 | 341 | if (!libxfs_verify_fsbno(mp, dbno)) |
2556c98b BN |
342 | return 0; |
343 | if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap)) | |
344 | return 0; | |
cb5b3ef4 | 345 | } |
2556c98b | 346 | return 1; |
cb5b3ef4 MV |
347 | } |
348 | ||
2556c98b BN |
349 | |
350 | static void | |
351 | pf_read_btinode( | |
352 | prefetch_args_t *args, | |
7328ea6e | 353 | struct xfs_dinode *dino, |
2556c98b | 354 | int isadir) |
cb5b3ef4 | 355 | { |
2556c98b BN |
356 | xfs_bmdr_block_t *dib; |
357 | xfs_bmbt_ptr_t *pp; | |
358 | int i; | |
359 | int level; | |
360 | int numrecs; | |
361 | int dsize; | |
5a35bf2c | 362 | xfs_fsblock_t dbno; |
2556c98b BN |
363 | |
364 | dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino); | |
365 | ||
366 | level = be16_to_cpu(dib->bb_level); | |
367 | numrecs = be16_to_cpu(dib->bb_numrecs); | |
368 | ||
369 | if ((numrecs == 0) || (level == 0) || | |
370 | (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))) | |
cb5b3ef4 | 371 | return; |
2556c98b BN |
372 | /* |
373 | * use bmdr/dfork_dsize since the root block is in the data fork | |
374 | */ | |
375 | if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp)) | |
376 | return; | |
377 | ||
378 | dsize = XFS_DFORK_DSIZE(dino, mp); | |
e2f60652 | 379 | pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0)); |
cb5b3ef4 | 380 | |
2556c98b | 381 | for (i = 0; i < numrecs; i++) { |
fb36a55d | 382 | dbno = get_unaligned_be64(&pp[i]); |
a6bd55d3 | 383 | if (!libxfs_verify_fsbno(mp, dbno)) |
cb5b3ef4 | 384 | break; |
2556c98b | 385 | if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap)) |
cb5b3ef4 | 386 | break; |
2556c98b BN |
387 | } |
388 | } | |
389 | ||
390 | static void | |
391 | pf_read_exinode( | |
392 | prefetch_args_t *args, | |
7328ea6e | 393 | struct xfs_dinode *dino) |
2556c98b BN |
394 | { |
395 | pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino), | |
5f70c91b | 396 | xfs_dfork_data_extents(dino)); |
2556c98b | 397 | } |
cb5b3ef4 | 398 | |
2556c98b BN |
399 | static void |
400 | pf_read_inode_dirs( | |
401 | prefetch_args_t *args, | |
167137fe | 402 | struct xfs_buf *bp) |
2556c98b | 403 | { |
7328ea6e | 404 | struct xfs_dinode *dino; |
2556c98b | 405 | int icnt = 0; |
69ec88b5 BN |
406 | int hasdir = 0; |
407 | int isadir; | |
456371d8 | 408 | int error; |
2556c98b | 409 | |
456371d8 DW |
410 | error = -libxfs_readbuf_verify(bp, &xfs_inode_buf_ops); |
411 | if (error) | |
e0607266 DC |
412 | return; |
413 | ||
c0594dd6 DC |
414 | for (icnt = 0; |
415 | icnt < (BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog); | |
416 | icnt++) { | |
56b2de80 | 417 | dino = xfs_make_iptr(mp, bp, icnt); |
2556c98b BN |
418 | |
419 | /* | |
420 | * We are only prefetching directory contents in extents | |
421 | * and btree nodes for other inodes | |
422 | */ | |
56b2de80 | 423 | isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR; |
69ec88b5 BN |
424 | hasdir |= isadir; |
425 | ||
56b2de80 | 426 | if (dino->di_format <= XFS_DINODE_FMT_LOCAL) |
69ec88b5 BN |
427 | continue; |
428 | ||
56b2de80 | 429 | if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS || |
69ec88b5 | 430 | args->dirs_only)) |
2556c98b BN |
431 | continue; |
432 | ||
433 | /* | |
434 | * do some checks on the inode to see if we can prefetch | |
435 | * its directory data. It's a cut down version of | |
436 | * process_dinode_int() in dinode.c. | |
437 | */ | |
56b2de80 | 438 | if (dino->di_format > XFS_DINODE_FMT_BTREE) |
2556c98b BN |
439 | continue; |
440 | ||
56b2de80 | 441 | if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC) |
2556c98b BN |
442 | continue; |
443 | ||
03d8044d | 444 | if (!libxfs_dinode_good_version(mp, dino->di_version)) |
2556c98b BN |
445 | continue; |
446 | ||
56b2de80 | 447 | if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp)) |
2556c98b BN |
448 | continue; |
449 | ||
56b2de80 | 450 | if ((dino->di_forkoff != 0) && |
4de63245 | 451 | (dino->di_forkoff >= XFS_LITINO(mp) >> 3)) |
2556c98b BN |
452 | continue; |
453 | ||
56b2de80 | 454 | switch (dino->di_format) { |
2556c98b BN |
455 | case XFS_DINODE_FMT_EXTENTS: |
456 | pf_read_exinode(args, dino); | |
cb5b3ef4 | 457 | break; |
2556c98b | 458 | case XFS_DINODE_FMT_BTREE: |
69ec88b5 | 459 | pf_read_btinode(args, dino, isadir); |
cb5b3ef4 | 460 | break; |
cb5b3ef4 MV |
461 | } |
462 | } | |
69ec88b5 | 463 | if (hasdir) |
af60a998 | 464 | libxfs_buf_set_priority(bp, B_DIR_INODE); |
cb5b3ef4 MV |
465 | } |
466 | ||
dd9093de DC |
467 | /* |
468 | * pf_batch_read must be called with the lock locked. | |
469 | */ | |
cb5b3ef4 | 470 | static void |
2556c98b BN |
471 | pf_batch_read( |
472 | prefetch_args_t *args, | |
473 | pf_which_t which, | |
474 | void *buf) | |
cb5b3ef4 | 475 | { |
167137fe | 476 | struct xfs_buf *bplist[MAX_BUFS]; |
2556c98b | 477 | unsigned int num; |
9e726740 | 478 | off_t first_off, last_off, next_off; |
2556c98b | 479 | int len, size; |
cb5b3ef4 | 480 | int i; |
2556c98b | 481 | int inode_bufs; |
e33b06a3 | 482 | unsigned long fsbno = 0; |
379397bf | 483 | unsigned long max_fsbno; |
2556c98b BN |
484 | char *pbuf; |
485 | ||
bb34c934 | 486 | for (;;) { |
379397bf | 487 | num = 0; |
bb34c934 BN |
488 | if (which == PF_SECONDARY) { |
489 | bplist[0] = btree_find(args->io_queue, 0, &fsbno); | |
68d16907 | 490 | max_fsbno = min(fsbno + pf_max_fsbs, |
bb34c934 BN |
491 | args->last_bno_read); |
492 | } else { | |
493 | bplist[0] = btree_find(args->io_queue, | |
494 | args->last_bno_read, &fsbno); | |
495 | max_fsbno = fsbno + pf_max_fsbs; | |
496 | } | |
379397bf | 497 | while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) { |
dd9093de | 498 | /* |
bbd32754 DC |
499 | * Discontiguous buffers need special handling, so stop |
500 | * gathering new buffers and process the list and this | |
501 | * discontigous buffer immediately. This avoids the | |
502 | * complexity of keeping a separate discontigous buffer | |
503 | * list and seeking back over ranges we've already done | |
504 | * optimised reads for. | |
dd9093de DC |
505 | */ |
506 | if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) { | |
bbd32754 DC |
507 | num++; |
508 | break; | |
509 | } | |
510 | ||
511 | if (which != PF_META_ONLY || | |
af60a998 | 512 | !B_IS_INODE(libxfs_buf_priority(bplist[num]))) |
379397bf | 513 | num++; |
e49f30a7 ES |
514 | if (num == MAX_BUFS) |
515 | break; | |
bb34c934 | 516 | bplist[num] = btree_lookup_next(args->io_queue, &fsbno); |
2556c98b | 517 | } |
379397bf BN |
518 | if (!num) |
519 | return; | |
cb5b3ef4 | 520 | |
2556c98b BN |
521 | /* |
522 | * do a big read if 25% of the potential buffer is useful, | |
523 | * otherwise, find as many close together blocks and | |
524 | * read them in one read | |
525 | */ | |
d4aaa66b DC |
526 | first_off = LIBXFS_BBTOOFF64(xfs_buf_daddr(bplist[0])); |
527 | last_off = LIBXFS_BBTOOFF64(xfs_buf_daddr(bplist[num-1])) + | |
c0594dd6 | 528 | BBTOB(bplist[num-1]->b_length); |
2c350101 | 529 | while (num > 1 && last_off - first_off > pf_max_bytes) { |
2556c98b | 530 | num--; |
d4aaa66b | 531 | last_off = LIBXFS_BBTOOFF64(xfs_buf_daddr(bplist[num-1])) + |
c0594dd6 | 532 | BBTOB(bplist[num-1]->b_length); |
2556c98b BN |
533 | } |
534 | if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) { | |
535 | /* | |
536 | * not enough blocks for one big read, so determine | |
537 | * the number of blocks that are close enough. | |
538 | */ | |
c0594dd6 | 539 | last_off = first_off + BBTOB(bplist[0]->b_length); |
2556c98b | 540 | for (i = 1; i < num; i++) { |
d4aaa66b | 541 | next_off = LIBXFS_BBTOOFF64(xfs_buf_daddr(bplist[i])) + |
c0594dd6 | 542 | BBTOB(bplist[i]->b_length); |
2556c98b BN |
543 | if (next_off - last_off > pf_batch_bytes) |
544 | break; | |
545 | last_off = next_off; | |
546 | } | |
547 | num = i; | |
548 | } | |
cb5b3ef4 | 549 | |
2556c98b | 550 | for (i = 0; i < num; i++) { |
bb34c934 | 551 | if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp, |
d4aaa66b | 552 | xfs_buf_daddr(bplist[i]))) == NULL) |
2556c98b | 553 | do_error(_("prefetch corruption\n")); |
cb5b3ef4 MV |
554 | } |
555 | ||
2556c98b BN |
556 | if (which == PF_PRIMARY) { |
557 | for (inode_bufs = 0, i = 0; i < num; i++) { | |
af60a998 | 558 | if (B_IS_INODE(libxfs_buf_priority(bplist[i]))) |
2556c98b BN |
559 | inode_bufs++; |
560 | } | |
561 | args->inode_bufs_queued -= inode_bufs; | |
562 | if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) > | |
563 | pf_batch_fsbs) | |
564 | args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog); | |
565 | } | |
566 | #ifdef XR_PF_TRACE | |
567 | pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)", | |
d4aaa66b DC |
568 | (long long)xfs_buf_daddr(bplist[0]), |
569 | (long long)xfs_buf_daddr(bplist[num-1]), num, | |
2556c98b BN |
570 | (which != PF_SECONDARY) ? "pri" : "sec", args->agno, |
571 | args->last_bno_read, args->inode_bufs_queued); | |
572 | #endif | |
573 | pthread_mutex_unlock(&args->lock); | |
574 | ||
575 | /* | |
576 | * now read the data and put into the xfs_but_t's | |
577 | */ | |
2f9a125c | 578 | len = pread(mp_fd, buf, (int)(last_off - first_off), first_off); |
bbd32754 DC |
579 | |
580 | /* | |
581 | * Check the last buffer on the list to see if we need to | |
582 | * process a discontiguous buffer. The gather above loop | |
583 | * guarantees that only the last buffer in the list will be a | |
584 | * discontiguous buffer. | |
585 | */ | |
586 | if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) { | |
587 | libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0); | |
588 | bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED; | |
e02ba985 | 589 | libxfs_buf_relse(bplist[num - 1]); |
bbd32754 DC |
590 | num--; |
591 | } | |
592 | ||
2556c98b BN |
593 | if (len > 0) { |
594 | /* | |
167137fe DC |
595 | * go through the struct xfs_buf list copying from the |
596 | * read buffer into the struct xfs_buf's and release them. | |
2556c98b | 597 | */ |
2556c98b BN |
598 | for (i = 0; i < num; i++) { |
599 | ||
d4aaa66b | 600 | pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(xfs_buf_daddr(bplist[i])) - first_off); |
c0594dd6 | 601 | size = BBTOB(bplist[i]->b_length); |
2556c98b BN |
602 | if (len < size) |
603 | break; | |
04338619 | 604 | memcpy(bplist[i]->b_addr, pbuf, size); |
adbb3573 DC |
605 | bplist[i]->b_flags |= (LIBXFS_B_UPTODATE | |
606 | LIBXFS_B_UNCHECKED); | |
2556c98b | 607 | len -= size; |
af60a998 | 608 | if (B_IS_INODE(libxfs_buf_priority(bplist[i]))) |
2556c98b | 609 | pf_read_inode_dirs(args, bplist[i]); |
69ec88b5 | 610 | else if (which == PF_META_ONLY) |
af60a998 | 611 | libxfs_buf_set_priority(bplist[i], |
69ec88b5 BN |
612 | B_DIR_META_H); |
613 | else if (which == PF_PRIMARY && num == 1) | |
af60a998 | 614 | libxfs_buf_set_priority(bplist[i], |
69ec88b5 | 615 | B_DIR_META_S); |
2556c98b BN |
616 | } |
617 | } | |
618 | for (i = 0; i < num; i++) { | |
2556c98b | 619 | pftrace("putbuf %c %p (%llu) in AG %d", |
af60a998 DW |
620 | B_IS_INODE(libxfs_buf_priority(bplist[i])) ? |
621 | 'I' : 'M', | |
d4aaa66b | 622 | bplist[i], (long long)xfs_buf_daddr(bplist[i]), |
2556c98b | 623 | args->agno); |
e02ba985 | 624 | libxfs_buf_relse(bplist[i]); |
2556c98b BN |
625 | } |
626 | pthread_mutex_lock(&args->lock); | |
627 | if (which != PF_SECONDARY) { | |
2556c98b BN |
628 | pftrace("inode_bufs_queued for AG %d = %d", args->agno, |
629 | args->inode_bufs_queued); | |
2556c98b BN |
630 | /* |
631 | * if primary inode queue running low, process metadata | |
632 | * in boths queues to avoid I/O starvation as the | |
633 | * processing thread would be waiting for a metadata | |
634 | * buffer | |
635 | */ | |
636 | if (which == PF_PRIMARY && !args->queuing_done && | |
637 | args->inode_bufs_queued < IO_THRESHOLD) { | |
2556c98b BN |
638 | pftrace("reading metadata bufs from primary queue for AG %d", |
639 | args->agno); | |
4c0a98ae | 640 | |
2556c98b | 641 | pf_batch_read(args, PF_META_ONLY, buf); |
4c0a98ae | 642 | |
2556c98b BN |
643 | pftrace("reading bufs from secondary queue for AG %d", |
644 | args->agno); | |
4c0a98ae | 645 | |
2556c98b BN |
646 | pf_batch_read(args, PF_SECONDARY, buf); |
647 | } | |
cb5b3ef4 | 648 | } |
cb5b3ef4 | 649 | } |
2556c98b BN |
650 | } |
651 | ||
652 | static void * | |
653 | pf_io_worker( | |
654 | void *param) | |
655 | { | |
656 | prefetch_args_t *args = param; | |
657 | void *buf = memalign(libxfs_device_alignment(), | |
658 | pf_max_bytes); | |
659 | ||
660 | if (buf == NULL) | |
661 | return NULL; | |
cb5b3ef4 | 662 | |
e4da1b16 | 663 | rcu_register_thread(); |
2556c98b | 664 | pthread_mutex_lock(&args->lock); |
bb34c934 | 665 | while (!args->queuing_done || !btree_is_empty(args->io_queue)) { |
2556c98b | 666 | pftrace("waiting to start prefetch I/O for AG %d", args->agno); |
4c0a98ae | 667 | |
2556c98b BN |
668 | while (!args->can_start_reading && !args->queuing_done) |
669 | pthread_cond_wait(&args->start_reading, &args->lock); | |
4c0a98ae | 670 | |
2556c98b | 671 | pftrace("starting prefetch I/O for AG %d", args->agno); |
4c0a98ae | 672 | |
2556c98b BN |
673 | pf_batch_read(args, PF_PRIMARY, buf); |
674 | pf_batch_read(args, PF_SECONDARY, buf); | |
675 | ||
2556c98b | 676 | pftrace("ran out of bufs to prefetch for AG %d", args->agno); |
4c0a98ae | 677 | |
2556c98b BN |
678 | if (!args->queuing_done) |
679 | args->can_start_reading = 0; | |
cb5b3ef4 | 680 | } |
2556c98b | 681 | pthread_mutex_unlock(&args->lock); |
cb5b3ef4 | 682 | |
2556c98b BN |
683 | free(buf); |
684 | ||
2556c98b | 685 | pftrace("finished prefetch I/O for AG %d", args->agno); |
e4da1b16 | 686 | rcu_unregister_thread(); |
4c0a98ae | 687 | |
2556c98b | 688 | return NULL; |
cb5b3ef4 MV |
689 | } |
690 | ||
2556c98b BN |
691 | static int |
692 | pf_create_prefetch_thread( | |
693 | prefetch_args_t *args); | |
694 | ||
b97ad969 JM |
695 | /* |
696 | * If we fail to create the queuing thread or can't create even one | |
697 | * prefetch thread, we need to let processing continue without it. | |
698 | */ | |
699 | static void | |
700 | pf_skip_prefetch_thread(prefetch_args_t *args) | |
701 | { | |
702 | prefetch_args_t *next; | |
703 | ||
704 | pthread_mutex_lock(&args->lock); | |
705 | args->prefetch_done = 1; | |
706 | pf_start_processing(args); | |
707 | next = args->next_args; | |
708 | args->next_args = NULL; | |
709 | pthread_mutex_unlock(&args->lock); | |
710 | ||
711 | if (next) | |
712 | pf_create_prefetch_thread(next); | |
713 | } | |
714 | ||
2556c98b BN |
715 | static void * |
716 | pf_queuing_worker( | |
717 | void *param) | |
cb5b3ef4 | 718 | { |
2556c98b | 719 | prefetch_args_t *args = param; |
b97ad969 | 720 | prefetch_args_t *next_args; |
2556c98b BN |
721 | int num_inos; |
722 | ino_tree_node_t *irec; | |
723 | ino_tree_node_t *cur_irec; | |
2556c98b | 724 | xfs_agblock_t bno; |
cb5b3ef4 | 725 | int i; |
2556c98b | 726 | int err; |
870b18fd | 727 | uint64_t sparse; |
e7fd2b6f | 728 | struct xfs_ino_geometry *igeo = M_IGEO(mp); |
41baceb7 | 729 | unsigned long long cluster_mask; |
2556c98b | 730 | |
e4da1b16 DC |
731 | rcu_register_thread(); |
732 | ||
41baceb7 | 733 | cluster_mask = (1ULL << igeo->inodes_per_cluster) - 1; |
2556c98b BN |
734 | |
735 | for (i = 0; i < PF_THREAD_COUNT; i++) { | |
736 | err = pthread_create(&args->io_threads[i], NULL, | |
737 | pf_io_worker, args); | |
738 | if (err != 0) { | |
739 | do_warn(_("failed to create prefetch thread: %s\n"), | |
740 | strerror(err)); | |
e8ff6275 JM |
741 | pftrace("failed to create prefetch thread for AG %d: %s", |
742 | args->agno, strerror(err)); | |
53dc81db | 743 | args->io_threads[i] = 0; |
2556c98b | 744 | if (i == 0) { |
b97ad969 | 745 | pf_skip_prefetch_thread(args); |
e4da1b16 | 746 | goto out; |
2556c98b BN |
747 | } |
748 | /* | |
749 | * since we have at least one I/O thread, use them for | |
750 | * prefetch | |
751 | */ | |
752 | break; | |
753 | } | |
cb5b3ef4 | 754 | } |
2556c98b | 755 | pftrace("starting prefetch for AG %d", args->agno); |
cb5b3ef4 | 756 | |
2556c98b BN |
757 | for (irec = findfirst_inode_rec(args->agno); irec != NULL; |
758 | irec = next_ino_rec(irec)) { | |
cb5b3ef4 | 759 | |
2556c98b | 760 | cur_irec = irec; |
cb5b3ef4 | 761 | |
2556c98b | 762 | num_inos = XFS_INODES_PER_CHUNK; |
e7fd2b6f | 763 | while (num_inos < igeo->ialloc_inos && irec != NULL) { |
2556c98b BN |
764 | irec = next_ino_rec(irec); |
765 | num_inos += XFS_INODES_PER_CHUNK; | |
766 | } | |
cb5b3ef4 | 767 | |
2556c98b BN |
768 | if (args->dirs_only && cur_irec->ino_isa_dir == 0) |
769 | continue; | |
770 | #ifdef XR_PF_TRACE | |
771 | sem_getvalue(&args->ra_count, &i); | |
772 | pftrace("queuing irec %p in AG %d, sem count = %d", | |
773 | irec, args->agno, i); | |
774 | #endif | |
3724f674 | 775 | err = sem_trywait(&args->ra_count); |
004e18d4 | 776 | if (err < 0 && errno == EAGAIN) { |
3724f674 CH |
777 | /* |
778 | * Kick the queue once we have reached the limit; | |
779 | * without this the threads processing the inodes | |
780 | * might get stuck on a buffer that has been locked | |
781 | * and added to the I/O queue but is waiting for | |
782 | * the thread to be woken. | |
7cf2aa1a ES |
783 | * Start processing as well, in case everything so |
784 | * far was already prefetched and the queue is empty. | |
3724f674 CH |
785 | */ |
786 | pf_start_io_workers(args); | |
7cf2aa1a | 787 | pf_start_processing(args); |
3724f674 CH |
788 | sem_wait(&args->ra_count); |
789 | } | |
2556c98b BN |
790 | |
791 | num_inos = 0; | |
792 | bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum); | |
870b18fd | 793 | sparse = cur_irec->ir_sparse; |
2556c98b BN |
794 | |
795 | do { | |
dd9093de DC |
796 | struct xfs_buf_map map; |
797 | ||
798 | map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno); | |
41baceb7 DW |
799 | map.bm_len = XFS_FSB_TO_BB(mp, |
800 | igeo->blocks_per_cluster); | |
870b18fd BF |
801 | |
802 | /* | |
803 | * Queue I/O for each non-sparse cluster. We can check | |
804 | * sparse state in cluster sized chunks as cluster size | |
805 | * is the min. granularity of sparse irec regions. | |
806 | */ | |
41baceb7 | 807 | if ((sparse & cluster_mask) == 0) |
870b18fd BF |
808 | pf_queue_io(args, &map, 1, |
809 | (cur_irec->ino_isa_dir != 0) ? | |
810 | B_DIR_INODE : B_INODE); | |
811 | ||
41baceb7 DW |
812 | bno += igeo->blocks_per_cluster; |
813 | num_inos += igeo->inodes_per_cluster; | |
814 | sparse >>= igeo->inodes_per_cluster; | |
e7fd2b6f | 815 | } while (num_inos < igeo->ialloc_inos); |
cb5b3ef4 MV |
816 | } |
817 | ||
2556c98b | 818 | pthread_mutex_lock(&args->lock); |
cb5b3ef4 | 819 | |
2556c98b BN |
820 | pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)", |
821 | args->agno, args->inode_bufs_queued); | |
4c0a98ae | 822 | |
2556c98b BN |
823 | args->queuing_done = 1; |
824 | pf_start_io_workers(args); | |
825 | pf_start_processing(args); | |
826 | pthread_mutex_unlock(&args->lock); | |
827 | ||
828 | /* now wait for the readers to finish */ | |
829 | for (i = 0; i < PF_THREAD_COUNT; i++) | |
830 | if (args->io_threads[i]) | |
831 | pthread_join(args->io_threads[i], NULL); | |
832 | ||
2556c98b | 833 | pftrace("prefetch for AG %d finished", args->agno); |
4c0a98ae | 834 | |
2556c98b BN |
835 | pthread_mutex_lock(&args->lock); |
836 | ||
bb34c934 | 837 | ASSERT(btree_is_empty(args->io_queue)); |
2556c98b BN |
838 | |
839 | args->prefetch_done = 1; | |
b97ad969 JM |
840 | next_args = args->next_args; |
841 | args->next_args = NULL; | |
2556c98b BN |
842 | pthread_mutex_unlock(&args->lock); |
843 | ||
b97ad969 JM |
844 | if (next_args) |
845 | pf_create_prefetch_thread(next_args); | |
846 | ||
e4da1b16 DC |
847 | out: |
848 | rcu_unregister_thread(); | |
2556c98b | 849 | return NULL; |
cb5b3ef4 MV |
850 | } |
851 | ||
2556c98b BN |
852 | static int |
853 | pf_create_prefetch_thread( | |
854 | prefetch_args_t *args) | |
855 | { | |
856 | int err; | |
857 | ||
2556c98b | 858 | pftrace("creating queue thread for AG %d", args->agno); |
4c0a98ae | 859 | |
2556c98b BN |
860 | err = pthread_create(&args->queuing_thread, NULL, |
861 | pf_queuing_worker, args); | |
862 | if (err != 0) { | |
863 | do_warn(_("failed to create prefetch thread: %s\n"), | |
864 | strerror(err)); | |
e8ff6275 JM |
865 | pftrace("failed to create prefetch thread for AG %d: %s", |
866 | args->agno, strerror(err)); | |
53dc81db | 867 | args->queuing_thread = 0; |
b97ad969 | 868 | pf_skip_prefetch_thread(args); |
2556c98b BN |
869 | } |
870 | ||
871 | return err == 0; | |
872 | } | |
cb5b3ef4 MV |
873 | |
874 | void | |
2556c98b BN |
875 | init_prefetch( |
876 | xfs_mount_t *pmp) | |
cb5b3ef4 | 877 | { |
2556c98b | 878 | mp = pmp; |
7b47b1bc | 879 | mp_fd = mp->m_ddev_targp->bt_bdev_fd;; |
2556c98b BN |
880 | pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7; |
881 | pf_max_bbs = pf_max_bytes >> BBSHIFT; | |
882 | pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog; | |
883 | pf_batch_bytes = DEF_BATCH_BYTES; | |
884 | pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1); | |
885 | } | |
cb5b3ef4 | 886 | |
2556c98b BN |
/*
 * Create a prefetch context for AG @agno and either start its queuing
 * thread immediately or chain it behind @prev_args so it starts when the
 * previous AG's prefetch completes.
 *
 * @agno:      AG to prefetch; must be < sb_agcount.
 * @dirs_only: only read directory inodes/metadata (phase 6 usage).
 * @prev_args: prefetch context of the preceding AG, or NULL.
 *
 * Returns the new context, or NULL if prefetch is disabled, @agno is out
 * of range, or thread creation failed.
 */
prefetch_args_t *
start_inode_prefetch(
	xfs_agnumber_t		agno,
	int			dirs_only,
	prefetch_args_t		*prev_args)
{
	prefetch_args_t		*args;
	long			max_queue;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);

	if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
		return NULL;

	/* NOTE(review): calloc result is not checked; a failure here would
	 * crash in btree_init() below - confirm whether OOM should do_error */
	args = calloc(1, sizeof(prefetch_args_t));

	btree_init(&args->io_queue);
	if (pthread_mutex_init(&args->lock, NULL) != 0)
		do_error(_("failed to initialize prefetch mutex\n"));
	if (pthread_cond_init(&args->start_reading, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	if (pthread_cond_init(&args->start_processing, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	args->agno = agno;
	args->dirs_only = dirs_only;

	/*
	 * use only 1/8 of the libxfs cache as we are only counting inodes
	 * and not any other associated metadata like directories
	 */

	max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
	/* scale the queue depth when inode clusters span multiple fs blocks */
	if (igeo->inode_cluster_size > mp->m_sb.sb_blocksize)
		max_queue = max_queue * igeo->blocks_per_cluster /
				igeo->ialloc_blks;

	/* ra_count throttles how many reads may be in flight at once */
	sem_init(&args->ra_count, 0, max_queue);

	if (!prev_args) {
		/* no predecessor - start the queuing thread right away */
		if (!pf_create_prefetch_thread(args))
			return NULL;
	} else {
		/*
		 * Chain behind the previous AG.  The lock guards against the
		 * predecessor finishing while we decide: if it is already
		 * done we must start our own thread, otherwise its worker
		 * will pick up next_args when it completes.
		 */
		pthread_mutex_lock(&prev_args->lock);
		if (prev_args->prefetch_done) {
			pthread_mutex_unlock(&prev_args->lock);
			if (!pf_create_prefetch_thread(args))
				args = NULL;
		} else {
			prev_args->next_args = args;
			pftrace("queued AG %d after AG %d",
				args->agno, prev_args->agno);
			pthread_mutex_unlock(&prev_args->lock);
		}
	}

	return args;
}
943 | ||
71014d19 DC |
944 | /* |
945 | * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It | |
946 | * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch | |
947 | * or process @end_ag). The function starts prefetch on the first AG, then loops | |
948 | * starting prefetch on the next AG and then blocks processing the current AG as | |
949 | * the prefetch queue brings inodes into the processing queue. | |
950 | * | |
951 | * There is only one prefetch taking place at a time, so the prefetch on the | |
952 | * next AG only starts once the current AG has been completely prefetched. Hence | |
953 | * the prefetch of the next AG will start some time before the processing of the | |
954 | * current AG finishes, ensuring that when we iterate an start processing the | |
955 | * next AG there is already a significant queue of inodes to process. | |
956 | * | |
957 | * Prefetch is done this way to prevent it from running too far ahead of the | |
958 | * processing. Allowing it to do so can cause cache thrashing, where new | |
959 | * prefetch causes previously prefetched buffers to be reclaimed before the | |
960 | * processing thread uses them. This results in reading all the inodes and | |
961 | * metadata twice per phase and it greatly slows down the processing. Hence we | |
962 | * have to carefully control how far ahead we prefetch... | |
963 | */ | |
964 | static void | |
965 | prefetch_ag_range( | |
62843f36 | 966 | struct workqueue *work, |
71014d19 DC |
967 | xfs_agnumber_t start_ag, |
968 | xfs_agnumber_t end_ag, | |
969 | bool dirs_only, | |
62843f36 | 970 | void (*func)(struct workqueue *, |
71014d19 DC |
971 | xfs_agnumber_t, void *)) |
972 | { | |
973 | int i; | |
974 | struct prefetch_args *pf_args[2]; | |
975 | ||
976 | pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL); | |
977 | for (i = start_ag; i < end_ag; i++) { | |
978 | /* Don't prefetch end_ag */ | |
979 | if (i + 1 < end_ag) | |
980 | pf_args[(~i) & 1] = start_inode_prefetch(i + 1, | |
981 | dirs_only, pf_args[i & 1]); | |
982 | func(work, i, pf_args[i & 1]); | |
983 | } | |
984 | } | |
985 | ||
/* Arguments handed to prefetch_ag_range_work() through the workqueue. */
struct pf_work_args {
	xfs_agnumber_t	start_ag;	/* first AG to prefetch/process */
	xfs_agnumber_t	end_ag;		/* first AG NOT processed (exclusive) */
	bool		dirs_only;	/* prefetch directory metadata only */
	void		(*func)(struct workqueue *, xfs_agnumber_t, void *);
};
992 | ||
993 | static void | |
994 | prefetch_ag_range_work( | |
62843f36 | 995 | struct workqueue *work, |
71014d19 DC |
996 | xfs_agnumber_t unused, |
997 | void *args) | |
998 | { | |
999 | struct pf_work_args *wargs = args; | |
1000 | ||
f8149110 | 1001 | prefetch_ag_range(work, wargs->start_ag, wargs->end_ag, |
71014d19 DC |
1002 | wargs->dirs_only, wargs->func); |
1003 | free(args); | |
1004 | } | |
1005 | ||
1164bde5 DC |
1006 | /* |
1007 | * Do inode prefetch in the most optimal way for the context under which repair | |
1008 | * has been run. | |
1009 | */ | |
1010 | void | |
1011 | do_inode_prefetch( | |
1012 | struct xfs_mount *mp, | |
1013 | int stride, | |
62843f36 | 1014 | void (*func)(struct workqueue *, |
1164bde5 DC |
1015 | xfs_agnumber_t, void *), |
1016 | bool check_cache, | |
1017 | bool dirs_only) | |
1018 | { | |
71014d19 | 1019 | int i; |
62843f36 DW |
1020 | struct workqueue queue; |
1021 | struct workqueue *queues; | |
f994d14f | 1022 | int queues_started = 0; |
1164bde5 DC |
1023 | |
1024 | /* | |
1025 | * If the previous phases of repair have not overflowed the buffer | |
1026 | * cache, then we don't need to re-read any of the metadata in the | |
1027 | * filesystem - it's all in the cache. In that case, run a thread per | |
1028 | * CPU to maximise parallelism of the queue to be processed. | |
1029 | */ | |
1030 | if (check_cache && !libxfs_bcache_overflowed()) { | |
62843f36 | 1031 | queue.wq_ctx = mp; |
4b45ff6f | 1032 | create_work_queue(&queue, mp, platform_nproc()); |
1164bde5 DC |
1033 | for (i = 0; i < mp->m_sb.sb_agcount; i++) |
1034 | queue_work(&queue, func, i, NULL); | |
1035 | destroy_work_queue(&queue); | |
1036 | return; | |
1037 | } | |
1038 | ||
1039 | /* | |
1040 | * single threaded behaviour - single prefetch thread, processed | |
1041 | * directly after each AG is queued. | |
1042 | */ | |
1043 | if (!stride) { | |
62843f36 | 1044 | queue.wq_ctx = mp; |
71014d19 DC |
1045 | prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount, |
1046 | dirs_only, func); | |
1164bde5 DC |
1047 | return; |
1048 | } | |
1049 | ||
1050 | /* | |
1051 | * create one worker thread for each segment of the volume | |
1052 | */ | |
62843f36 | 1053 | queues = malloc(thread_count * sizeof(struct workqueue)); |
71014d19 DC |
1054 | for (i = 0; i < thread_count; i++) { |
1055 | struct pf_work_args *wargs; | |
1056 | ||
1057 | wargs = malloc(sizeof(struct pf_work_args)); | |
1058 | wargs->start_ag = i * stride; | |
1059 | wargs->end_ag = min((i + 1) * stride, | |
1060 | mp->m_sb.sb_agcount); | |
1061 | wargs->dirs_only = dirs_only; | |
1062 | wargs->func = func; | |
1063 | ||
1164bde5 | 1064 | create_work_queue(&queues[i], mp, 1); |
71014d19 | 1065 | queue_work(&queues[i], prefetch_ag_range_work, 0, wargs); |
f994d14f | 1066 | queues_started++; |
71014d19 DC |
1067 | |
1068 | if (wargs->end_ag >= mp->m_sb.sb_agcount) | |
1069 | break; | |
1164bde5 | 1070 | } |
71014d19 | 1071 | |
1164bde5 DC |
1072 | /* |
1073 | * wait for workers to complete | |
1074 | */ | |
f994d14f | 1075 | for (i = 0; i < queues_started; i++) |
1164bde5 DC |
1076 | destroy_work_queue(&queues[i]); |
1077 | free(queues); | |
1078 | } | |
1079 | ||
/*
 * Block the caller until the prefetch thread for @args' AG signals (via
 * start_processing) that processing may begin.  A NULL @args - prefetch
 * disabled or never started - returns immediately.
 */
void
wait_for_inode_prefetch(
	prefetch_args_t	*args)
{
	if (args == NULL)
		return;

	pthread_mutex_lock(&args->lock);

	/* standard condvar loop: re-check the predicate after every wakeup */
	while (!args->can_start_processing) {
		pftrace("waiting to start processing AG %d", args->agno);

		pthread_cond_wait(&args->start_processing, &args->lock);
	}
	pftrace("can start processing AG %d", args->agno);

	pthread_mutex_unlock(&args->lock);
}
cb5b3ef4 | 1098 | |
2556c98b BN |
/*
 * Tear down a prefetch context after its AG has been processed: join the
 * queuing thread, then destroy the synchronization primitives and the I/O
 * queue, and free @args.  Safe to call with NULL @args.
 */
void
cleanup_inode_prefetch(
	prefetch_args_t	*args)
{
	if (args == NULL)
		return;

	pftrace("waiting AG %d prefetch to finish", args->agno);

	/* queuing_thread is 0 when creation failed - nothing to join then */
	if (args->queuing_thread)
		pthread_join(args->queuing_thread, NULL);

	pftrace("AG %d prefetch done", args->agno);

	/* any chained AG must have been handed off before cleanup */
	ASSERT(args->next_args == NULL);

	pthread_mutex_destroy(&args->lock);
	pthread_cond_destroy(&args->start_reading);
	pthread_cond_destroy(&args->start_processing);
	sem_destroy(&args->ra_count);
	btree_destroy(args->io_queue);

	free(args);
}
1123 | ||
2556c98b BN |
1124 | #ifdef XR_PF_TRACE |
1125 | ||
4c0a98ae BN |
1126 | static FILE *pf_trace_file; |
1127 | ||
1128 | void | |
1129 | pftrace_init(void) | |
1130 | { | |
1131 | pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w"); | |
1132 | setvbuf(pf_trace_file, NULL, _IOLBF, 1024); | |
1133 | } | |
1134 | ||
1135 | void | |
1136 | pftrace_done(void) | |
1137 | { | |
1138 | fclose(pf_trace_file); | |
1139 | } | |
1140 | ||
cb5b3ef4 | 1141 | void |
2556c98b | 1142 | _pftrace(const char *func, const char *msg, ...) |
cb5b3ef4 | 1143 | { |
2556c98b BN |
1144 | char buf[200]; |
1145 | struct timeval tv; | |
1146 | va_list args; | |
cb5b3ef4 | 1147 | |
2556c98b | 1148 | gettimeofday(&tv, NULL); |
cb5b3ef4 | 1149 | |
2556c98b BN |
1150 | va_start(args, msg); |
1151 | vsnprintf(buf, sizeof(buf), msg, args); | |
1152 | buf[sizeof(buf)-1] = '\0'; | |
1153 | va_end(args); | |
cb5b3ef4 | 1154 | |
4c0a98ae BN |
1155 | fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec, |
1156 | func, buf); | |
cb5b3ef4 | 1157 | } |
2556c98b BN |
1158 | |
1159 | #endif |