]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - repair/prefetch.c
repair: simplify bmap_next_offset
[thirdparty/xfsprogs-dev.git] / repair / prefetch.c
CommitLineData
959ef981
DC
1// SPDX-License-Identifier: GPL-2.0
2
6b803e5a 3#include "libxfs.h"
2556c98b 4#include <pthread.h>
cb5b3ef4 5#include "avl.h"
379397bf 6#include "btree.h"
cb5b3ef4
MV
7#include "globals.h"
8#include "agheader.h"
9#include "incore.h"
cb5b3ef4 10#include "dir2.h"
cb5b3ef4
MV
11#include "protos.h"
12#include "err_protos.h"
13#include "dinode.h"
14#include "bmap.h"
15#include "versions.h"
2556c98b
BN
16#include "threads.h"
17#include "prefetch.h"
18#include "progress.h"
cb5b3ef4
MV
19
20int do_prefetch = 1;
21
2556c98b
BN
22/*
23 * Performs prefetching by priming the libxfs cache by using a dedicated thread
24 * scanning inodes and reading blocks in ahead of the time they are required.
25 *
26 * Any I/O errors can be safely ignored.
27 */
cb5b3ef4 28
2556c98b
BN
29static xfs_mount_t *mp;
30static int mp_fd;
31static int pf_max_bytes;
32static int pf_max_bbs;
33static int pf_max_fsbs;
34static int pf_batch_bytes;
35static int pf_batch_fsbs;
cb5b3ef4 36
69ec88b5
BN
37static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);
38
a040d7c9
BN
39/*
40 * Buffer priorities for the libxfs cache
41 *
42 * Directory metadata is ranked higher than other metadata as it's used
43 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
44 */
69ec88b5 45
a040d7c9
BN
/*
 * Each priority is fully parenthesized so that it expands as a single
 * operand wherever it is used (e.g. "2 * B_DIR_BMAP" or "B_X - B_Y").
 */

/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	(CACHE_PREFETCH_PRIORITY + 7)
/* directory metadata in secondary queue */
#define B_DIR_META_2	(CACHE_PREFETCH_PRIORITY + 6)
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	(CACHE_PREFETCH_PRIORITY + 5)
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	(CACHE_PREFETCH_PRIORITY + 4)
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	(CACHE_PREFETCH_PRIORITY + 3)
/* inode clusters with directory inodes */
#define B_DIR_INODE	(CACHE_PREFETCH_PRIORITY + 2)
/* intermediate extent btree nodes */
#define B_BMAP		(CACHE_PREFETCH_PRIORITY + 1)
/* inode clusters without any directory entries */
#define B_INODE		(CACHE_PREFETCH_PRIORITY)
69ec88b5 62
a040d7c9
BN
63/*
64 * Test whether bit 0 or bit 2 is set in the "priority tag" of the buffer: if
65 * neither is set the buffer holds inodes, otherwise it holds other metadata.
66 */
67#define B_IS_INODE(f) (((f) & 5) == 0)
cb5b3ef4 68
2556c98b
BN
/* default batch size: 64KiB */
#define DEF_BATCH_BYTES	(64 * 1024)

/* capacity of the buffer list gathered for one batched read */
#define MAX_BUFS	128

/* number of queued inode buffers at which the I/O workers are started */
#define IO_THRESHOLD	(2 * MAX_BUFS)
2556c98b
BN
74
/*
 * Selects which part of the I/O queue a pf_batch_read() call services.
 */
typedef enum pf_which {
	PF_PRIMARY,	/* buffers looked up starting at last_bno_read */
	PF_SECONDARY,	/* buffers looked up from block 0 up to last_bno_read */
	PF_META_ONLY	/* same range as primary, but inode buffers are skipped */
} pf_which_t;
80
81
82static inline void
83pf_start_processing(
84 prefetch_args_t *args)
85{
86 if (!args->can_start_processing) {
2556c98b 87 pftrace("signalling processing for AG %d", args->agno);
4c0a98ae 88
2556c98b
BN
89 args->can_start_processing = 1;
90 pthread_cond_signal(&args->start_processing);
cb5b3ef4 91 }
2556c98b
BN
92}
93
94static inline void
95pf_start_io_workers(
96 prefetch_args_t *args)
97{
98 if (!args->can_start_reading) {
2556c98b 99 pftrace("signalling reading for AG %d", args->agno);
4c0a98ae 100
2556c98b
BN
101 args->can_start_reading = 1;
102 pthread_cond_broadcast(&args->start_reading);
cb5b3ef4 103 }
cb5b3ef4
MV
104}
105
2556c98b 106
cb5b3ef4 107static void
2556c98b
BN
108pf_queue_io(
109 prefetch_args_t *args,
dd9093de
DC
110 struct xfs_buf_map *map,
111 int nmaps,
2556c98b 112 int flag)
cb5b3ef4 113{
dd9093de
DC
114 struct xfs_buf *bp;
115 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);
583ca112 116 int error;
cb5b3ef4 117
2ae22647
CH
118 /*
119 * Never block on a buffer lock here, given that the actual repair
120 * code might lock buffers in a different order from us. Given that
121 * the lock holder is either reading it from disk himself or
122 * completely overwriting it this behaviour is perfectly fine.
123 */
583ca112
DW
124 error = -libxfs_buf_get_map(mp->m_dev, map, nmaps,
125 LIBXFS_GETBUF_TRYLOCK, &bp);
126 if (error)
2ae22647
CH
127 return;
128
2556c98b 129 if (bp->b_flags & LIBXFS_B_UPTODATE) {
69ec88b5
BN
130 if (B_IS_INODE(flag))
131 pf_read_inode_dirs(args, bp);
af60a998 132 libxfs_buf_set_priority(bp, libxfs_buf_priority(bp) +
b3563c19 133 CACHE_PREFETCH_PRIORITY);
e02ba985 134 libxfs_buf_relse(bp);
cb5b3ef4
MV
135 return;
136 }
af60a998 137 libxfs_buf_set_priority(bp, flag);
cb5b3ef4 138
2556c98b 139 pthread_mutex_lock(&args->lock);
cb5b3ef4 140
bb34c934
BN
141 btree_insert(args->io_queue, fsbno, bp);
142
2556c98b 143 if (fsbno > args->last_bno_read) {
379397bf 144 if (B_IS_INODE(flag)) {
2556c98b
BN
145 args->inode_bufs_queued++;
146 if (args->inode_bufs_queued == IO_THRESHOLD)
147 pf_start_io_workers(args);
cb5b3ef4 148 }
2556c98b 149 } else {
08cee623 150 ASSERT(!B_IS_INODE(flag));
af60a998 151 libxfs_buf_set_priority(bp, B_DIR_META_2);
cb5b3ef4
MV
152 }
153
4c0a98ae
BN
154 pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
155 "(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
156 'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
157 args->inode_bufs_queued, args->last_bno_read);
158
2556c98b 159 pf_start_processing(args);
cb5b3ef4 160
2556c98b 161 pthread_mutex_unlock(&args->lock);
cb5b3ef4
MV
162}
163
2556c98b
BN
164static int
165pf_read_bmbt_reclist(
166 prefetch_args_t *args,
167 xfs_bmbt_rec_t *rp,
168 int numrecs)
cb5b3ef4 169{
cb5b3ef4 170 int i;
e0a12bda 171 xfs_bmbt_irec_t irec;
5a35bf2c
DC
172 xfs_filblks_t cp = 0; /* prev count */
173 xfs_fileoff_t op = 0; /* prev offset */
dd9093de
DC
174#define MAP_ARRAY_SZ 4
175 struct xfs_buf_map map_array[MAP_ARRAY_SZ];
176 struct xfs_buf_map *map = map_array;
177 int max_extents = MAP_ARRAY_SZ;
24e04791 178 int nmaps = 0;
dd9093de
DC
179 unsigned int len = 0;
180 int ret = 0;
181
2556c98b 182
5e656dbb
BN
183 for (i = 0; i < numrecs; i++) {
184 libxfs_bmbt_disk_get_all(rp + i, &irec);
2556c98b 185
e0a12bda
BN
186 if (((i > 0) && (op + cp > irec.br_startoff)) ||
187 (irec.br_blockcount == 0) ||
188 (irec.br_startoff >= fs_max_file_offset))
dd9093de 189 goto out_free;
2556c98b 190
a6bd55d3
DW
191 if (!libxfs_verify_fsbno(mp, irec.br_startblock) ||
192 !libxfs_verify_fsbno(mp, irec.br_startblock +
193 irec.br_blockcount - 1))
dd9093de 194 goto out_free;
2556c98b 195
e0a12bda 196 if (!args->dirs_only && ((irec.br_startoff +
ff105f75 197 irec.br_blockcount) >= mp->m_dir_geo->freeblk))
2556c98b
BN
198 break; /* only Phase 6 reads the free blocks */
199
e0a12bda
BN
200 op = irec.br_startoff;
201 cp = irec.br_blockcount;
2556c98b 202
e0a12bda 203 while (irec.br_blockcount) {
dd9093de 204 unsigned int bm_len;
4c0a98ae 205
2556c98b 206 pftrace("queuing dir extent in AG %d", args->agno);
4c0a98ae 207
ff105f75
DC
208 if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
209 bm_len = mp->m_dir_geo->fsbcount - len;
dd9093de
DC
210 else
211 bm_len = irec.br_blockcount;
212 len += bm_len;
213
214 map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
215 irec.br_startblock);
216 map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
217 nmaps++;
218
ff105f75 219 if (len == mp->m_dir_geo->fsbcount) {
dd9093de
DC
220 pf_queue_io(args, map, nmaps, B_DIR_META);
221 len = 0;
222 nmaps = 0;
223 }
224
225 irec.br_blockcount -= bm_len;
226 irec.br_startblock += bm_len;
227
228 /*
229 * Handle very fragmented dir2 blocks with dynamically
230 * allocated buffer maps.
231 */
232 if (nmaps >= max_extents) {
233 struct xfs_buf_map *old_map = NULL;
234
235 if (map == map_array) {
236 old_map = map;
237 map = NULL;
238 }
239 max_extents *= 2;
240 map = realloc(map, max_extents * sizeof(*map));
241 if (map == NULL) {
242 do_error(
243 _("couldn't malloc dir2 buffer list\n"));
244 exit(1);
245 }
246 if (old_map)
247 memcpy(map, old_map, sizeof(map_array));
248 }
249
2556c98b
BN
250 }
251 }
dd9093de
DC
252 ret = 1;
253out_free:
254 if (map != map_array)
255 free(map);
256 return ret;
2556c98b 257}
cb5b3ef4 258
2556c98b
BN
259/*
260 * simplified version of the main scan_lbtree. Returns 0 to stop.
261 */
262
263static int
264pf_scan_lbtree(
5a35bf2c 265 xfs_fsblock_t dbno,
2556c98b
BN
266 int level,
267 int isadir,
268 prefetch_args_t *args,
b3563c19 269 int (*func)(struct xfs_btree_block *block,
2556c98b
BN
270 int level,
271 int isadir,
272 prefetch_args_t *args))
273{
274 xfs_buf_t *bp;
275 int rc;
31079e67 276 int error;
cb5b3ef4 277
31079e67
DW
278 error = -libxfs_buf_read(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
279 XFS_FSB_TO_BB(mp, 1), LIBXFS_READBUF_SALVAGE, &bp,
280 &xfs_bmbt_buf_ops);
281 if (error)
2556c98b 282 return 0;
cb5b3ef4 283
af60a998 284 libxfs_buf_set_priority(bp, isadir ? B_DIR_BMAP : B_BMAP);
69ec88b5 285
43ba1861
DW
286 /*
287 * If the verifier flagged a problem with the buffer, we can't trust
288 * its contents for the purposes of reading ahead. Stop prefetching
289 * the tree and mark the buffer unchecked so that the next read of the
290 * buffer will retain the error status and be acted upon appropriately.
291 */
292 if (bp->b_error) {
293 bp->b_flags |= LIBXFS_B_UNCHECKED;
e02ba985 294 libxfs_buf_relse(bp);
43ba1861
DW
295 return 0;
296 }
297
b3563c19 298 rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);
cb5b3ef4 299
e02ba985 300 libxfs_buf_relse(bp);
cb5b3ef4 301
2556c98b
BN
302 return rc;
303}
304
305static int
306pf_scanfunc_bmap(
b3563c19 307 struct xfs_btree_block *block,
2556c98b
BN
308 int level,
309 int isadir,
310 prefetch_args_t *args)
311{
2556c98b
BN
312 xfs_bmbt_ptr_t *pp;
313 int numrecs;
314 int i;
5a35bf2c 315 xfs_fsblock_t dbno;
2556c98b
BN
316
317 /*
318 * do some validation on the block contents
319 */
1c88e98c
DC
320 if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
321 block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
2556c98b
BN
322 (be16_to_cpu(block->bb_level) != level))
323 return 0;
324
325 numrecs = be16_to_cpu(block->bb_numrecs);
326
327 if (level == 0) {
328 if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
329 return 0;
5e656dbb 330 return pf_read_bmbt_reclist(args,
b3563c19 331 XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
cb5b3ef4
MV
332 }
333
2556c98b
BN
334 if (numrecs > mp->m_bmap_dmxr[1])
335 return 0;
cb5b3ef4 336
b3563c19 337 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
2556c98b
BN
338
339 for (i = 0; i < numrecs; i++) {
fb36a55d 340 dbno = get_unaligned_be64(&pp[i]);
a6bd55d3 341 if (!libxfs_verify_fsbno(mp, dbno))
2556c98b
BN
342 return 0;
343 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
344 return 0;
cb5b3ef4 345 }
2556c98b 346 return 1;
cb5b3ef4
MV
347}
348
2556c98b
BN
349
350static void
351pf_read_btinode(
352 prefetch_args_t *args,
353 xfs_dinode_t *dino,
354 int isadir)
cb5b3ef4 355{
2556c98b
BN
356 xfs_bmdr_block_t *dib;
357 xfs_bmbt_ptr_t *pp;
358 int i;
359 int level;
360 int numrecs;
361 int dsize;
5a35bf2c 362 xfs_fsblock_t dbno;
2556c98b
BN
363
364 dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);
365
366 level = be16_to_cpu(dib->bb_level);
367 numrecs = be16_to_cpu(dib->bb_numrecs);
368
369 if ((numrecs == 0) || (level == 0) ||
370 (level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
cb5b3ef4 371 return;
2556c98b
BN
372 /*
373 * use bmdr/dfork_dsize since the root block is in the data fork
374 */
375 if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
376 return;
377
378 dsize = XFS_DFORK_DSIZE(dino, mp);
e2f60652 379 pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));
cb5b3ef4 380
2556c98b 381 for (i = 0; i < numrecs; i++) {
fb36a55d 382 dbno = get_unaligned_be64(&pp[i]);
a6bd55d3 383 if (!libxfs_verify_fsbno(mp, dbno))
cb5b3ef4 384 break;
2556c98b 385 if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
cb5b3ef4 386 break;
2556c98b
BN
387 }
388}
389
390static void
391pf_read_exinode(
392 prefetch_args_t *args,
393 xfs_dinode_t *dino)
394{
395 pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
56b2de80 396 be32_to_cpu(dino->di_nextents));
2556c98b 397}
cb5b3ef4 398
2556c98b
BN
399static void
400pf_read_inode_dirs(
401 prefetch_args_t *args,
402 xfs_buf_t *bp)
403{
404 xfs_dinode_t *dino;
405 int icnt = 0;
69ec88b5
BN
406 int hasdir = 0;
407 int isadir;
456371d8 408 int error;
2556c98b 409
456371d8
DW
410 error = -libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
411 if (error)
e0607266
DC
412 return;
413
135e4bfe 414 for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) {
56b2de80 415 dino = xfs_make_iptr(mp, bp, icnt);
2556c98b
BN
416
417 /*
418 * We are only prefetching directory contents in extents
419 * and btree nodes for other inodes
420 */
56b2de80 421 isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
69ec88b5
BN
422 hasdir |= isadir;
423
56b2de80 424 if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
69ec88b5
BN
425 continue;
426
56b2de80 427 if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
69ec88b5 428 args->dirs_only))
2556c98b
BN
429 continue;
430
431 /*
432 * do some checks on the inode to see if we can prefetch
433 * its directory data. It's a cut down version of
434 * process_dinode_int() in dinode.c.
435 */
56b2de80 436 if (dino->di_format > XFS_DINODE_FMT_BTREE)
2556c98b
BN
437 continue;
438
56b2de80 439 if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
2556c98b
BN
440 continue;
441
db84c7e8 442 if (!libxfs_dinode_good_version(&mp->m_sb, dino->di_version))
2556c98b
BN
443 continue;
444
56b2de80 445 if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
2556c98b
BN
446 continue;
447
56b2de80 448 if ((dino->di_forkoff != 0) &&
4de63245 449 (dino->di_forkoff >= XFS_LITINO(mp) >> 3))
2556c98b
BN
450 continue;
451
56b2de80 452 switch (dino->di_format) {
2556c98b
BN
453 case XFS_DINODE_FMT_EXTENTS:
454 pf_read_exinode(args, dino);
cb5b3ef4 455 break;
2556c98b 456 case XFS_DINODE_FMT_BTREE:
69ec88b5 457 pf_read_btinode(args, dino, isadir);
cb5b3ef4 458 break;
cb5b3ef4
MV
459 }
460 }
69ec88b5 461 if (hasdir)
af60a998 462 libxfs_buf_set_priority(bp, B_DIR_INODE);
cb5b3ef4
MV
463}
464
dd9093de
DC
465/*
466 * pf_batch_read must be called with the lock locked.
467 */
cb5b3ef4 468static void
2556c98b
BN
469pf_batch_read(
470 prefetch_args_t *args,
471 pf_which_t which,
472 void *buf)
cb5b3ef4 473{
2556c98b
BN
474 xfs_buf_t *bplist[MAX_BUFS];
475 unsigned int num;
476 off64_t first_off, last_off, next_off;
477 int len, size;
cb5b3ef4 478 int i;
2556c98b 479 int inode_bufs;
e33b06a3 480 unsigned long fsbno = 0;
379397bf 481 unsigned long max_fsbno;
2556c98b
BN
482 char *pbuf;
483
bb34c934 484 for (;;) {
379397bf 485 num = 0;
bb34c934
BN
486 if (which == PF_SECONDARY) {
487 bplist[0] = btree_find(args->io_queue, 0, &fsbno);
68d16907 488 max_fsbno = min(fsbno + pf_max_fsbs,
bb34c934
BN
489 args->last_bno_read);
490 } else {
491 bplist[0] = btree_find(args->io_queue,
492 args->last_bno_read, &fsbno);
493 max_fsbno = fsbno + pf_max_fsbs;
494 }
379397bf 495 while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
dd9093de 496 /*
bbd32754
DC
497 * Discontiguous buffers need special handling, so stop
498 * gathering new buffers and process the list and this
499 * discontigous buffer immediately. This avoids the
500 * complexity of keeping a separate discontigous buffer
501 * list and seeking back over ranges we've already done
502 * optimised reads for.
dd9093de
DC
503 */
504 if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
bbd32754
DC
505 num++;
506 break;
507 }
508
509 if (which != PF_META_ONLY ||
af60a998 510 !B_IS_INODE(libxfs_buf_priority(bplist[num])))
379397bf 511 num++;
e49f30a7
ES
512 if (num == MAX_BUFS)
513 break;
bb34c934 514 bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
2556c98b 515 }
379397bf
BN
516 if (!num)
517 return;
cb5b3ef4 518
2556c98b
BN
519 /*
520 * do a big read if 25% of the potential buffer is useful,
521 * otherwise, find as many close together blocks and
522 * read them in one read
523 */
524 first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
525 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
526 XFS_BUF_SIZE(bplist[num-1]);
2c350101 527 while (num > 1 && last_off - first_off > pf_max_bytes) {
2556c98b
BN
528 num--;
529 last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
530 XFS_BUF_SIZE(bplist[num-1]);
531 }
532 if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
533 /*
534 * not enough blocks for one big read, so determine
535 * the number of blocks that are close enough.
536 */
537 last_off = first_off + XFS_BUF_SIZE(bplist[0]);
538 for (i = 1; i < num; i++) {
539 next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
540 XFS_BUF_SIZE(bplist[i]);
541 if (next_off - last_off > pf_batch_bytes)
542 break;
543 last_off = next_off;
544 }
545 num = i;
546 }
cb5b3ef4 547
2556c98b 548 for (i = 0; i < num; i++) {
bb34c934 549 if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
2556c98b
BN
550 XFS_BUF_ADDR(bplist[i]))) == NULL)
551 do_error(_("prefetch corruption\n"));
cb5b3ef4
MV
552 }
553
2556c98b
BN
554 if (which == PF_PRIMARY) {
555 for (inode_bufs = 0, i = 0; i < num; i++) {
af60a998 556 if (B_IS_INODE(libxfs_buf_priority(bplist[i])))
2556c98b
BN
557 inode_bufs++;
558 }
559 args->inode_bufs_queued -= inode_bufs;
560 if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
561 pf_batch_fsbs)
562 args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
563 }
564#ifdef XR_PF_TRACE
565 pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
566 (long long)XFS_BUF_ADDR(bplist[0]),
567 (long long)XFS_BUF_ADDR(bplist[num-1]), num,
568 (which != PF_SECONDARY) ? "pri" : "sec", args->agno,
569 args->last_bno_read, args->inode_bufs_queued);
570#endif
571 pthread_mutex_unlock(&args->lock);
572
573 /*
574 * now read the data and put into the xfs_but_t's
575 */
2f9a125c 576 len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);
bbd32754
DC
577
578 /*
579 * Check the last buffer on the list to see if we need to
580 * process a discontiguous buffer. The gather above loop
581 * guarantees that only the last buffer in the list will be a
582 * discontiguous buffer.
583 */
584 if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
585 libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
586 bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
e02ba985 587 libxfs_buf_relse(bplist[num - 1]);
bbd32754
DC
588 num--;
589 }
590
2556c98b
BN
591 if (len > 0) {
592 /*
593 * go through the xfs_buf_t list copying from the
594 * read buffer into the xfs_buf_t's and release them.
595 */
2556c98b
BN
596 for (i = 0; i < num; i++) {
597
598 pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
599 size = XFS_BUF_SIZE(bplist[i]);
600 if (len < size)
601 break;
04338619 602 memcpy(bplist[i]->b_addr, pbuf, size);
adbb3573
DC
603 bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
604 LIBXFS_B_UNCHECKED);
2556c98b 605 len -= size;
af60a998 606 if (B_IS_INODE(libxfs_buf_priority(bplist[i])))
2556c98b 607 pf_read_inode_dirs(args, bplist[i]);
69ec88b5 608 else if (which == PF_META_ONLY)
af60a998 609 libxfs_buf_set_priority(bplist[i],
69ec88b5
BN
610 B_DIR_META_H);
611 else if (which == PF_PRIMARY && num == 1)
af60a998 612 libxfs_buf_set_priority(bplist[i],
69ec88b5 613 B_DIR_META_S);
2556c98b
BN
614 }
615 }
616 for (i = 0; i < num; i++) {
2556c98b 617 pftrace("putbuf %c %p (%llu) in AG %d",
af60a998
DW
618 B_IS_INODE(libxfs_buf_priority(bplist[i])) ?
619 'I' : 'M',
2556c98b
BN
620 bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
621 args->agno);
e02ba985 622 libxfs_buf_relse(bplist[i]);
2556c98b
BN
623 }
624 pthread_mutex_lock(&args->lock);
625 if (which != PF_SECONDARY) {
2556c98b
BN
626 pftrace("inode_bufs_queued for AG %d = %d", args->agno,
627 args->inode_bufs_queued);
2556c98b
BN
628 /*
629 * if primary inode queue running low, process metadata
630 * in boths queues to avoid I/O starvation as the
631 * processing thread would be waiting for a metadata
632 * buffer
633 */
634 if (which == PF_PRIMARY && !args->queuing_done &&
635 args->inode_bufs_queued < IO_THRESHOLD) {
2556c98b
BN
636 pftrace("reading metadata bufs from primary queue for AG %d",
637 args->agno);
4c0a98ae 638
2556c98b 639 pf_batch_read(args, PF_META_ONLY, buf);
4c0a98ae 640
2556c98b
BN
641 pftrace("reading bufs from secondary queue for AG %d",
642 args->agno);
4c0a98ae 643
2556c98b
BN
644 pf_batch_read(args, PF_SECONDARY, buf);
645 }
cb5b3ef4 646 }
cb5b3ef4 647 }
2556c98b
BN
648}
649
650static void *
651pf_io_worker(
652 void *param)
653{
654 prefetch_args_t *args = param;
655 void *buf = memalign(libxfs_device_alignment(),
656 pf_max_bytes);
657
658 if (buf == NULL)
659 return NULL;
cb5b3ef4 660
2556c98b 661 pthread_mutex_lock(&args->lock);
bb34c934 662 while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
2556c98b 663 pftrace("waiting to start prefetch I/O for AG %d", args->agno);
4c0a98ae 664
2556c98b
BN
665 while (!args->can_start_reading && !args->queuing_done)
666 pthread_cond_wait(&args->start_reading, &args->lock);
4c0a98ae 667
2556c98b 668 pftrace("starting prefetch I/O for AG %d", args->agno);
4c0a98ae 669
2556c98b
BN
670 pf_batch_read(args, PF_PRIMARY, buf);
671 pf_batch_read(args, PF_SECONDARY, buf);
672
2556c98b 673 pftrace("ran out of bufs to prefetch for AG %d", args->agno);
4c0a98ae 674
2556c98b
BN
675 if (!args->queuing_done)
676 args->can_start_reading = 0;
cb5b3ef4 677 }
2556c98b 678 pthread_mutex_unlock(&args->lock);
cb5b3ef4 679
2556c98b
BN
680 free(buf);
681
2556c98b 682 pftrace("finished prefetch I/O for AG %d", args->agno);
4c0a98ae 683
2556c98b 684 return NULL;
cb5b3ef4
MV
685}
686
2556c98b
BN
687static int
688pf_create_prefetch_thread(
689 prefetch_args_t *args);
690
b97ad969
JM
691/*
692 * If we fail to create the queuing thread or can't create even one
693 * prefetch thread, we need to let processing continue without it.
694 */
695static void
696pf_skip_prefetch_thread(prefetch_args_t *args)
697{
698 prefetch_args_t *next;
699
700 pthread_mutex_lock(&args->lock);
701 args->prefetch_done = 1;
702 pf_start_processing(args);
703 next = args->next_args;
704 args->next_args = NULL;
705 pthread_mutex_unlock(&args->lock);
706
707 if (next)
708 pf_create_prefetch_thread(next);
709}
710
2556c98b
BN
711static void *
712pf_queuing_worker(
713 void *param)
cb5b3ef4 714{
2556c98b 715 prefetch_args_t *args = param;
b97ad969 716 prefetch_args_t *next_args;
2556c98b
BN
717 int num_inos;
718 ino_tree_node_t *irec;
719 ino_tree_node_t *cur_irec;
2556c98b 720 xfs_agblock_t bno;
cb5b3ef4 721 int i;
2556c98b 722 int err;
870b18fd 723 uint64_t sparse;
e7fd2b6f 724 struct xfs_ino_geometry *igeo = M_IGEO(mp);
41baceb7 725 unsigned long long cluster_mask;
2556c98b 726
41baceb7 727 cluster_mask = (1ULL << igeo->inodes_per_cluster) - 1;
2556c98b
BN
728
729 for (i = 0; i < PF_THREAD_COUNT; i++) {
730 err = pthread_create(&args->io_threads[i], NULL,
731 pf_io_worker, args);
732 if (err != 0) {
733 do_warn(_("failed to create prefetch thread: %s\n"),
734 strerror(err));
e8ff6275
JM
735 pftrace("failed to create prefetch thread for AG %d: %s",
736 args->agno, strerror(err));
53dc81db 737 args->io_threads[i] = 0;
2556c98b 738 if (i == 0) {
b97ad969 739 pf_skip_prefetch_thread(args);
2556c98b
BN
740 return NULL;
741 }
742 /*
743 * since we have at least one I/O thread, use them for
744 * prefetch
745 */
746 break;
747 }
cb5b3ef4 748 }
2556c98b 749 pftrace("starting prefetch for AG %d", args->agno);
cb5b3ef4 750
2556c98b
BN
751 for (irec = findfirst_inode_rec(args->agno); irec != NULL;
752 irec = next_ino_rec(irec)) {
cb5b3ef4 753
2556c98b 754 cur_irec = irec;
cb5b3ef4 755
2556c98b 756 num_inos = XFS_INODES_PER_CHUNK;
e7fd2b6f 757 while (num_inos < igeo->ialloc_inos && irec != NULL) {
2556c98b
BN
758 irec = next_ino_rec(irec);
759 num_inos += XFS_INODES_PER_CHUNK;
760 }
cb5b3ef4 761
2556c98b
BN
762 if (args->dirs_only && cur_irec->ino_isa_dir == 0)
763 continue;
764#ifdef XR_PF_TRACE
765 sem_getvalue(&args->ra_count, &i);
766 pftrace("queuing irec %p in AG %d, sem count = %d",
767 irec, args->agno, i);
768#endif
3724f674 769 err = sem_trywait(&args->ra_count);
004e18d4 770 if (err < 0 && errno == EAGAIN) {
3724f674
CH
771 /*
772 * Kick the queue once we have reached the limit;
773 * without this the threads processing the inodes
774 * might get stuck on a buffer that has been locked
775 * and added to the I/O queue but is waiting for
776 * the thread to be woken.
7cf2aa1a
ES
777 * Start processing as well, in case everything so
778 * far was already prefetched and the queue is empty.
3724f674 779 */
7cf2aa1a 780
3724f674 781 pf_start_io_workers(args);
7cf2aa1a 782 pf_start_processing(args);
3724f674
CH
783 sem_wait(&args->ra_count);
784 }
2556c98b
BN
785
786 num_inos = 0;
787 bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
870b18fd 788 sparse = cur_irec->ir_sparse;
2556c98b
BN
789
790 do {
dd9093de
DC
791 struct xfs_buf_map map;
792
793 map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
41baceb7
DW
794 map.bm_len = XFS_FSB_TO_BB(mp,
795 igeo->blocks_per_cluster);
870b18fd
BF
796
797 /*
798 * Queue I/O for each non-sparse cluster. We can check
799 * sparse state in cluster sized chunks as cluster size
800 * is the min. granularity of sparse irec regions.
801 */
41baceb7 802 if ((sparse & cluster_mask) == 0)
870b18fd
BF
803 pf_queue_io(args, &map, 1,
804 (cur_irec->ino_isa_dir != 0) ?
805 B_DIR_INODE : B_INODE);
806
41baceb7
DW
807 bno += igeo->blocks_per_cluster;
808 num_inos += igeo->inodes_per_cluster;
809 sparse >>= igeo->inodes_per_cluster;
e7fd2b6f 810 } while (num_inos < igeo->ialloc_inos);
cb5b3ef4
MV
811 }
812
2556c98b 813 pthread_mutex_lock(&args->lock);
cb5b3ef4 814
2556c98b
BN
815 pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
816 args->agno, args->inode_bufs_queued);
4c0a98ae 817
2556c98b
BN
818 args->queuing_done = 1;
819 pf_start_io_workers(args);
820 pf_start_processing(args);
821 pthread_mutex_unlock(&args->lock);
822
823 /* now wait for the readers to finish */
824 for (i = 0; i < PF_THREAD_COUNT; i++)
825 if (args->io_threads[i])
826 pthread_join(args->io_threads[i], NULL);
827
2556c98b 828 pftrace("prefetch for AG %d finished", args->agno);
4c0a98ae 829
2556c98b
BN
830 pthread_mutex_lock(&args->lock);
831
bb34c934 832 ASSERT(btree_is_empty(args->io_queue));
2556c98b
BN
833
834 args->prefetch_done = 1;
b97ad969
JM
835 next_args = args->next_args;
836 args->next_args = NULL;
2556c98b
BN
837 pthread_mutex_unlock(&args->lock);
838
b97ad969
JM
839 if (next_args)
840 pf_create_prefetch_thread(next_args);
841
2556c98b 842 return NULL;
cb5b3ef4
MV
843}
844
2556c98b
BN
845static int
846pf_create_prefetch_thread(
847 prefetch_args_t *args)
848{
849 int err;
850
2556c98b 851 pftrace("creating queue thread for AG %d", args->agno);
4c0a98ae 852
2556c98b
BN
853 err = pthread_create(&args->queuing_thread, NULL,
854 pf_queuing_worker, args);
855 if (err != 0) {
856 do_warn(_("failed to create prefetch thread: %s\n"),
857 strerror(err));
e8ff6275
JM
858 pftrace("failed to create prefetch thread for AG %d: %s",
859 args->agno, strerror(err));
53dc81db 860 args->queuing_thread = 0;
b97ad969 861 pf_skip_prefetch_thread(args);
2556c98b
BN
862 }
863
864 return err == 0;
865}
cb5b3ef4
MV
866
867void
2556c98b
BN
868init_prefetch(
869 xfs_mount_t *pmp)
cb5b3ef4 870{
2556c98b 871 mp = pmp;
ab434d12 872 mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->bt_bdev);
2556c98b
BN
873 pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
874 pf_max_bbs = pf_max_bytes >> BBSHIFT;
875 pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
876 pf_batch_bytes = DEF_BATCH_BYTES;
877 pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
878}
cb5b3ef4 879
2556c98b
BN
880prefetch_args_t *
881start_inode_prefetch(
882 xfs_agnumber_t agno,
883 int dirs_only,
884 prefetch_args_t *prev_args)
885{
886 prefetch_args_t *args;
edf3f9d0 887 long max_queue;
e7fd2b6f 888 struct xfs_ino_geometry *igeo = M_IGEO(mp);
cb5b3ef4 889
2556c98b
BN
890 if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
891 return NULL;
cb5b3ef4 892
2556c98b
BN
893 args = calloc(1, sizeof(prefetch_args_t));
894
bb34c934 895 btree_init(&args->io_queue);
5e656dbb
BN
896 if (pthread_mutex_init(&args->lock, NULL) != 0)
897 do_error(_("failed to initialize prefetch mutex\n"));
898 if (pthread_cond_init(&args->start_reading, NULL) != 0)
899 do_error(_("failed to initialize prefetch cond var\n"));
900 if (pthread_cond_init(&args->start_processing, NULL) != 0)
901 do_error(_("failed to initialize prefetch cond var\n"));
2556c98b
BN
902 args->agno = agno;
903 args->dirs_only = dirs_only;
904
905 /*
906 * use only 1/8 of the libxfs cache as we are only counting inodes
907 * and not any other associated metadata like directories
908 */
909
edf3f9d0 910 max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
e7fd2b6f 911 if (igeo->inode_cluster_size > mp->m_sb.sb_blocksize)
41baceb7
DW
912 max_queue = max_queue * igeo->blocks_per_cluster /
913 igeo->ialloc_blks;
edf3f9d0
BN
914
915 sem_init(&args->ra_count, 0, max_queue);
2556c98b
BN
916
917 if (!prev_args) {
918 if (!pf_create_prefetch_thread(args))
919 return NULL;
920 } else {
921 pthread_mutex_lock(&prev_args->lock);
922 if (prev_args->prefetch_done) {
b97ad969 923 pthread_mutex_unlock(&prev_args->lock);
2556c98b
BN
924 if (!pf_create_prefetch_thread(args))
925 args = NULL;
e8ff6275 926 } else {
2556c98b 927 prev_args->next_args = args;
e8ff6275
JM
928 pftrace("queued AG %d after AG %d",
929 args->agno, prev_args->agno);
b97ad969 930 pthread_mutex_unlock(&prev_args->lock);
e8ff6275 931 }
cb5b3ef4 932 }
2556c98b
BN
933
934 return args;
cb5b3ef4
MV
935}
936
71014d19
DC
937/*
938 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
939 * begins with @start+ag, and finishes with @end_ag - 1 (i.e. does not prefetch
940 * or process @end_ag). The function starts prefetch on the first AG, then loops
941 * starting prefetch on the next AG and then blocks processing the current AG as
942 * the prefetch queue brings inodes into the processing queue.
943 *
944 * There is only one prefetch taking place at a time, so the prefetch on the
945 * next AG only starts once the current AG has been completely prefetched. Hence
946 * the prefetch of the next AG will start some time before the processing of the
947 * current AG finishes, ensuring that when we iterate an start processing the
948 * next AG there is already a significant queue of inodes to process.
949 *
950 * Prefetch is done this way to prevent it from running too far ahead of the
951 * processing. Allowing it to do so can cause cache thrashing, where new
952 * prefetch causes previously prefetched buffers to be reclaimed before the
953 * processing thread uses them. This results in reading all the inodes and
954 * metadata twice per phase and it greatly slows down the processing. Hence we
955 * have to carefully control how far ahead we prefetch...
956 */
957static void
958prefetch_ag_range(
62843f36 959 struct workqueue *work,
71014d19
DC
960 xfs_agnumber_t start_ag,
961 xfs_agnumber_t end_ag,
962 bool dirs_only,
62843f36 963 void (*func)(struct workqueue *,
71014d19
DC
964 xfs_agnumber_t, void *))
965{
966 int i;
967 struct prefetch_args *pf_args[2];
968
969 pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
970 for (i = start_ag; i < end_ag; i++) {
971 /* Don't prefetch end_ag */
972 if (i + 1 < end_ag)
973 pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
974 dirs_only, pf_args[i & 1]);
975 func(work, i, pf_args[i & 1]);
976 }
977}
978
979struct pf_work_args {
980 xfs_agnumber_t start_ag;
981 xfs_agnumber_t end_ag;
982 bool dirs_only;
62843f36 983 void (*func)(struct workqueue *, xfs_agnumber_t, void *);
71014d19
DC
984};
985
986static void
987prefetch_ag_range_work(
62843f36 988 struct workqueue *work,
71014d19
DC
989 xfs_agnumber_t unused,
990 void *args)
991{
992 struct pf_work_args *wargs = args;
993
f8149110 994 prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
71014d19
DC
995 wargs->dirs_only, wargs->func);
996 free(args);
997}
998
1164bde5
DC
999/*
1000 * Do inode prefetch in the most optimal way for the context under which repair
1001 * has been run.
1002 */
1003void
1004do_inode_prefetch(
1005 struct xfs_mount *mp,
1006 int stride,
62843f36 1007 void (*func)(struct workqueue *,
1164bde5
DC
1008 xfs_agnumber_t, void *),
1009 bool check_cache,
1010 bool dirs_only)
1011{
71014d19 1012 int i;
62843f36
DW
1013 struct workqueue queue;
1014 struct workqueue *queues;
f994d14f 1015 int queues_started = 0;
1164bde5
DC
1016
1017 /*
1018 * If the previous phases of repair have not overflowed the buffer
1019 * cache, then we don't need to re-read any of the metadata in the
1020 * filesystem - it's all in the cache. In that case, run a thread per
1021 * CPU to maximise parallelism of the queue to be processed.
1022 */
1023 if (check_cache && !libxfs_bcache_overflowed()) {
62843f36 1024 queue.wq_ctx = mp;
4b45ff6f 1025 create_work_queue(&queue, mp, platform_nproc());
1164bde5
DC
1026 for (i = 0; i < mp->m_sb.sb_agcount; i++)
1027 queue_work(&queue, func, i, NULL);
1028 destroy_work_queue(&queue);
1029 return;
1030 }
1031
1032 /*
1033 * single threaded behaviour - single prefetch thread, processed
1034 * directly after each AG is queued.
1035 */
1036 if (!stride) {
62843f36 1037 queue.wq_ctx = mp;
71014d19
DC
1038 prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
1039 dirs_only, func);
1164bde5
DC
1040 return;
1041 }
1042
1043 /*
1044 * create one worker thread for each segment of the volume
1045 */
62843f36 1046 queues = malloc(thread_count * sizeof(struct workqueue));
71014d19
DC
1047 for (i = 0; i < thread_count; i++) {
1048 struct pf_work_args *wargs;
1049
1050 wargs = malloc(sizeof(struct pf_work_args));
1051 wargs->start_ag = i * stride;
1052 wargs->end_ag = min((i + 1) * stride,
1053 mp->m_sb.sb_agcount);
1054 wargs->dirs_only = dirs_only;
1055 wargs->func = func;
1056
1164bde5 1057 create_work_queue(&queues[i], mp, 1);
71014d19 1058 queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
f994d14f 1059 queues_started++;
71014d19
DC
1060
1061 if (wargs->end_ag >= mp->m_sb.sb_agcount)
1062 break;
1164bde5 1063 }
71014d19 1064
1164bde5
DC
1065 /*
1066 * wait for workers to complete
1067 */
f994d14f 1068 for (i = 0; i < queues_started; i++)
1164bde5
DC
1069 destroy_work_queue(&queues[i]);
1070 free(queues);
1071}
1072
cb5b3ef4 1073void
2556c98b
BN
1074wait_for_inode_prefetch(
1075 prefetch_args_t *args)
cb5b3ef4 1076{
2556c98b 1077 if (args == NULL)
cb5b3ef4 1078 return;
2556c98b
BN
1079
1080 pthread_mutex_lock(&args->lock);
1081
1082 while (!args->can_start_processing) {
2556c98b 1083 pftrace("waiting to start processing AG %d", args->agno);
4c0a98ae 1084
2556c98b 1085 pthread_cond_wait(&args->start_processing, &args->lock);
cb5b3ef4 1086 }
2556c98b 1087 pftrace("can start processing AG %d", args->agno);
4c0a98ae 1088
2556c98b
BN
1089 pthread_mutex_unlock(&args->lock);
1090}
cb5b3ef4 1091
2556c98b
BN
1092void
1093cleanup_inode_prefetch(
1094 prefetch_args_t *args)
1095{
1096 if (args == NULL)
1097 return;
cb5b3ef4 1098
2556c98b 1099 pftrace("waiting AG %d prefetch to finish", args->agno);
4c0a98ae 1100
2556c98b
BN
1101 if (args->queuing_thread)
1102 pthread_join(args->queuing_thread, NULL);
1103
2556c98b 1104 pftrace("AG %d prefetch done", args->agno);
4c0a98ae 1105
b97ad969
JM
1106 ASSERT(args->next_args == NULL);
1107
2556c98b
BN
1108 pthread_mutex_destroy(&args->lock);
1109 pthread_cond_destroy(&args->start_reading);
1110 pthread_cond_destroy(&args->start_processing);
1111 sem_destroy(&args->ra_count);
bb34c934 1112 btree_destroy(args->io_queue);
2556c98b
BN
1113
1114 free(args);
cb5b3ef4
MV
1115}
1116
2556c98b
BN
1117#ifdef XR_PF_TRACE
1118
4c0a98ae
BN
/* Stream the prefetch trace is written to; NULL if it could not be opened. */
static FILE	*pf_trace_file;

/*
 * Open the prefetch trace file and switch it to line buffering.
 *
 * If the file cannot be opened, pf_trace_file is left NULL and setvbuf() is
 * skipped, since it must not be handed a NULL stream.
 */
void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	if (!pf_trace_file)
		return;

	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}
1127
1128void
1129pftrace_done(void)
1130{
1131 fclose(pf_trace_file);
1132}
1133
cb5b3ef4 1134void
2556c98b 1135_pftrace(const char *func, const char *msg, ...)
cb5b3ef4 1136{
2556c98b
BN
1137 char buf[200];
1138 struct timeval tv;
1139 va_list args;
cb5b3ef4 1140
2556c98b 1141 gettimeofday(&tv, NULL);
cb5b3ef4 1142
2556c98b
BN
1143 va_start(args, msg);
1144 vsnprintf(buf, sizeof(buf), msg, args);
1145 buf[sizeof(buf)-1] = '\0';
1146 va_end(args);
cb5b3ef4 1147
4c0a98ae
BN
1148 fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec,
1149 func, buf);
cb5b3ef4 1150}
2556c98b
BN
1151
1152#endif