// SPDX-License-Identifier: GPL-2.0

#include "libxfs.h"
#include <pthread.h>
#include "avl.h"
#include "btree.h"
#include "globals.h"
#include "agheader.h"
#include "incore.h"
#include "dir2.h"
#include "protos.h"
#include "err_protos.h"
#include "dinode.h"
#include "bmap.h"
#include "versions.h"
#include "threads.h"
#include "prefetch.h"
#include "progress.h"

int do_prefetch = 1;

/*
 * Performs prefetching by priming the libxfs cache, using a dedicated thread
 * that scans inodes and reads blocks ahead of the time they are required.
 *
 * Any I/O errors can be safely ignored.
 */

static xfs_mount_t	*mp;
static int		mp_fd;
static int		pf_max_bytes;
static int		pf_max_bbs;
static int		pf_max_fsbs;
static int		pf_batch_bytes;
static int		pf_batch_fsbs;

static void		pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);

/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 */

/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	CACHE_PREFETCH_PRIORITY + 7
/* directory metadata in secondary queue */
#define B_DIR_META_2	CACHE_PREFETCH_PRIORITY + 6
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	CACHE_PREFETCH_PRIORITY + 5
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	CACHE_PREFETCH_PRIORITY + 4
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	CACHE_PREFETCH_PRIORITY + 3
/* inode clusters with directory inodes */
#define B_DIR_INODE	CACHE_PREFETCH_PRIORITY + 2
/* intermediate extent btree nodes */
#define B_BMAP		CACHE_PREFETCH_PRIORITY + 1
/* inode clusters without any directory entries */
#define B_INODE		CACHE_PREFETCH_PRIORITY

/*
 * Test bits 0 and 2 of the buffer's "priority tag": if both are clear the
 * buffer holds inode clusters (B_INODE or B_DIR_INODE), otherwise it holds
 * other metadata.
 */
#define B_IS_INODE(f)	(((f) & 5) == 0)

#define DEF_BATCH_BYTES	0x10000

#define MAX_BUFS	128

#define IO_THRESHOLD	(MAX_BUFS * 2)

typedef enum pf_which {
	PF_PRIMARY,
	PF_SECONDARY,
	PF_META_ONLY
} pf_which_t;


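/*
 * Wake-up helpers: set the corresponding "can start" flag and signal any
 * waiters. pf_start_processing() releases the caller blocked in
 * wait_for_inode_prefetch(); pf_start_io_workers() wakes the I/O workers
 * waiting in pf_io_worker().
 */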
static inline void
pf_start_processing(
	prefetch_args_t	*args)
{
	if (!args->can_start_processing) {
		pftrace("signalling processing for AG %d", args->agno);

		args->can_start_processing = 1;
		pthread_cond_signal(&args->start_processing);
	}
}

static inline void
pf_start_io_workers(
	prefetch_args_t	*args)
{
	if (!args->can_start_reading) {
		pftrace("signalling reading for AG %d", args->agno);

		args->can_start_reading = 1;
		pthread_cond_broadcast(&args->start_reading);
	}
}


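/*
 * Queue a (possibly multi-extent) buffer for prefetch I/O. If the buffer is
 * already up to date in the cache, only bump its priority (and scan it for
 * directory inodes if it is an inode cluster); otherwise insert it into the
 * per-AG io_queue btree, keyed by filesystem block number, for the I/O
 * workers to read. Blocks at or behind last_bno_read are demoted to the
 * secondary queue priority (B_DIR_META_2).
 */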
static void
pf_queue_io(
	prefetch_args_t		*args,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flag)
{
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);

	/*
	 * Never block on a buffer lock here, given that the actual repair
	 * code might lock buffers in a different order from us. Given that
	 * the lock holder is either reading the buffer from disk itself or
	 * completely overwriting it, this behaviour is perfectly fine.
	 */
	bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
	if (!bp)
		return;

	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
				CACHE_PREFETCH_PRIORITY);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	btree_insert(args->io_queue, fsbno, bp);

	if (fsbno > args->last_bno_read) {
		if (B_IS_INODE(flag)) {
			args->inode_bufs_queued++;
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
	} else {
		ASSERT(!B_IS_INODE(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
	}

	pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue "
		"(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
		'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
		args->inode_bufs_queued, args->last_bno_read);

	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}

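/*
 * Walk a list of on-disk bmbt records belonging to a directory, validate each
 * extent and queue the directory blocks for prefetch in directory-block-sized
 * chunks. Returns 1 on success and 0 if a suspect record is found, which
 * tells the caller to stop scanning this inode.
 */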
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_filblks_t		cp = 0;	/* prev count */
	xfs_fileoff_t		op = 0;	/* prev offset */
#define MAP_ARRAY_SZ 4
	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
	struct xfs_buf_map	*map = map_array;
	int			max_extents = MAP_ARRAY_SZ;
	int			nmaps = 0;
	unsigned int		len = 0;
	int			ret = 0;


	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			goto out_free;

		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			goto out_free;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dir_geo->freeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		while (irec.br_blockcount) {
			unsigned int	bm_len;

			pftrace("queuing dir extent in AG %d", args->agno);

			if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
				bm_len = mp->m_dir_geo->fsbcount - len;
			else
				bm_len = irec.br_blockcount;
			len += bm_len;

			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
						irec.br_startblock);
			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
			nmaps++;

			if (len == mp->m_dir_geo->fsbcount) {
				pf_queue_io(args, map, nmaps, B_DIR_META);
				len = 0;
				nmaps = 0;
			}

			irec.br_blockcount -= bm_len;
			irec.br_startblock += bm_len;

			/*
			 * Handle very fragmented dir2 blocks with dynamically
			 * allocated buffer maps.
			 */
			if (nmaps >= max_extents) {
				struct xfs_buf_map *old_map = NULL;

				if (map == map_array) {
					old_map = map;
					map = NULL;
				}
				max_extents *= 2;
				map = realloc(map, max_extents * sizeof(*map));
				if (map == NULL) {
					do_error(
				_("couldn't malloc dir2 buffer list\n"));
					exit(1);
				}
				if (old_map)
					memcpy(map, old_map, sizeof(map_array));
			}

		}
	}
	ret = 1;
out_free:
	if (map != map_array)
		free(map);
	return ret;
}

/*
 * simplified version of the main scan_lbtree. Returns 0 to stop.
 */

static int
pf_scan_lbtree(
	xfs_fsblock_t		dbno,
	int			level,
	int			isadir,
	prefetch_args_t		*args,
	int			(*func)(struct xfs_btree_block *block,
					int level,
					int isadir,
					prefetch_args_t *args))
{
	xfs_buf_t		*bp;
	int			rc;

	bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
			XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
	if (!bp)
		return 0;

	XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);

	/*
	 * If the verifier flagged a problem with the buffer, we can't trust
	 * its contents for the purposes of reading ahead. Stop prefetching
	 * the tree and mark the buffer unchecked so that the next read of the
	 * buffer will retain the error status and be acted upon appropriately.
	 */
	if (bp->b_error) {
		bp->b_flags |= LIBXFS_B_UNCHECKED;
		libxfs_putbuf(bp);
		return 0;
	}

	rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);

	libxfs_putbuf(bp);

	return rc;
}

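/*
 * Callback for pf_scan_lbtree(): sanity check a bmbt block, queue the
 * directory extents of a leaf block via pf_read_bmbt_reclist(), or recurse
 * into the child blocks of an interior node.
 */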
static int
pf_scanfunc_bmap(
	struct xfs_btree_block	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_ptr_t		*pp;
	int			numrecs;
	int			i;
	xfs_fsblock_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
	     block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;
		return pf_read_bmbt_reclist(args,
			XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
	}

	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}


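/*
 * Prefetch the data fork of a btree-format inode: validate the bmdr root
 * block stored in the data fork and descend through its pointers with
 * pf_scan_lbtree().
 */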
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_fsblock_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}

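/*
 * Prefetch the extents of an extent-format directory inode; the extent list
 * is read straight out of the data fork.
 */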
static void
pf_read_exinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino)
{
	pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
			be32_to_cpu(dino->di_nextents));
}

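/*
 * Scan a freshly read inode cluster buffer and queue prefetch of directory
 * data for any plausible-looking directory inodes in it. The checks below are
 * a cut-down version of process_dinode_int(); the buffer priority is raised
 * to B_DIR_INODE if the cluster contains at least one directory.
 */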
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;

	libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
	if (bp->b_error)
		return;

	for (icnt = 0; icnt < (bp->b_bcount >> mp->m_sb.sb_inodelog); icnt++) {
		dino = xfs_make_iptr(mp, bp, icnt);

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dino->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!libxfs_dinode_good_version(mp, dino->di_version))
			continue;

		if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dino->di_forkoff != 0) &&
		    (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
			continue;

		switch (dino->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}

/*
 * pf_batch_read must be called with args->lock held.
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno = 0;
	unsigned long		max_fsbno;
	char			*pbuf;

	for (;;) {
		num = 0;
		if (which == PF_SECONDARY) {
			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
			max_fsbno = min(fsbno + pf_max_fsbs,
					args->last_bno_read);
		} else {
			bplist[0] = btree_find(args->io_queue,
						args->last_bno_read, &fsbno);
			max_fsbno = fsbno + pf_max_fsbs;
		}
		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
			/*
			 * Discontiguous buffers need special handling, so stop
			 * gathering new buffers and process the list and this
			 * discontiguous buffer immediately. This avoids the
			 * complexity of keeping a separate discontiguous buffer
			 * list and seeking back over ranges we've already done
			 * optimised reads for.
			 */
			if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
				num++;
				break;
			}

			if (which != PF_META_ONLY ||
			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
				num++;
			if (num == MAX_BUFS)
				break;
			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
		}
		if (!num)
			return;

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks and
		 * read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		while (num > 1 && last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		for (i = 0; i < num; i++) {
			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
		}
#ifdef XR_PF_TRACE
		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);
#endif
		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_buf_t's
		 */
		len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);

		/*
		 * Check the last buffer on the list to see if we need to
		 * process a discontiguous buffer. The gather loop above
		 * guarantees that only the last buffer in the list will be a
		 * discontiguous buffer.
		 */
		if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
			libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
			bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
			libxfs_putbuf(bplist[num - 1]);
			num--;
		}

		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			for (i = 0; i < num; i++) {

				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;
				memcpy(bplist[i]->b_addr, pbuf, size);
				bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
						LIBXFS_B_UNCHECKED);
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
							B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
							B_DIR_META_S);
			}
		}
		for (i = 0; i < num; i++) {
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);
			/*
			 * if the primary inode queue is running low, process
			 * metadata in both queues to avoid I/O starvation as
			 * the processing thread would be waiting for a
			 * metadata buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_META_ONLY, buf);

				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}

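/*
 * Body of the prefetch I/O threads: allocate an aligned batch buffer, then
 * repeatedly wait until reading is allowed and drain the primary and
 * secondary queues with pf_batch_read() until queuing is done and the
 * io_queue is empty.
 */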
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	void			*buf = memalign(libxfs_device_alignment(),
						pf_max_bytes);

	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);

		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);

		pftrace("starting prefetch I/O for AG %d", args->agno);

		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

		pftrace("ran out of bufs to prefetch for AG %d", args->agno);

		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

	pftrace("finished prefetch I/O for AG %d", args->agno);

	return NULL;
}

static int
pf_create_prefetch_thread(
	prefetch_args_t		*args);

/*
 * If we fail to create the queuing thread or can't create even one
 * prefetch thread, we need to let processing continue without it.
 */
static void
pf_skip_prefetch_thread(prefetch_args_t *args)
{
	prefetch_args_t		*next;

	pthread_mutex_lock(&args->lock);
	args->prefetch_done = 1;
	pf_start_processing(args);
	next = args->next_args;
	args->next_args = NULL;
	pthread_mutex_unlock(&args->lock);

	if (next)
		pf_create_prefetch_thread(next);
}

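/*
 * Body of the per-AG queuing thread: start the I/O workers, walk the AG's
 * inode records and queue every non-sparse inode cluster for prefetch,
 * throttled by the ra_count semaphore so that queuing does not run too far
 * ahead of processing. Once all clusters are queued, wait for the I/O
 * threads to finish and hand over to the next queued AG, if any.
 */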
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	prefetch_args_t		*next_args;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;
	uint64_t		sparse;

	blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			pftrace("failed to create prefetch thread for AG %d: %s",
				args->agno, strerror(err));
			args->io_threads[i] = 0;
			if (i == 0) {
				pf_skip_prefetch_thread(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}
	pftrace("starting prefetch for AG %d", args->agno);

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < mp->m_ialloc_inos && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		err = sem_trywait(&args->ra_count);
		if (err < 0 && errno == EAGAIN) {
			/*
			 * Kick the queue once we have reached the limit;
			 * without this the threads processing the inodes
			 * might get stuck on a buffer that has been locked
			 * and added to the I/O queue but is waiting for
			 * the thread to be woken.
			 */
			pf_start_io_workers(args);
			sem_wait(&args->ra_count);
		}

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
		sparse = cur_irec->ir_sparse;

		do {
			struct xfs_buf_map	map;

			map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
			map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);

			/*
			 * Queue I/O for each non-sparse cluster. We can check
			 * sparse state in cluster sized chunks as cluster size
			 * is the min. granularity of sparse irec regions.
			 */
			if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
				pf_queue_io(args, &map, 1,
					    (cur_irec->ino_isa_dir != 0) ?
					     B_DIR_INODE : B_INODE);

			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
			sparse >>= inodes_per_cluster;
		} while (num_inos < mp->m_ialloc_inos);
	}

	pthread_mutex_lock(&args->lock);

	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);

	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

	pftrace("prefetch for AG %d finished", args->agno);

	pthread_mutex_lock(&args->lock);

	ASSERT(btree_is_empty(args->io_queue));

	args->prefetch_done = 1;
	next_args = args->next_args;
	args->next_args = NULL;
	pthread_mutex_unlock(&args->lock);

	if (next_args)
		pf_create_prefetch_thread(next_args);

	return NULL;
}

static int
pf_create_prefetch_thread(
	prefetch_args_t		*args)
{
	int			err;

	pftrace("creating queue thread for AG %d", args->agno);

	err = pthread_create(&args->queuing_thread, NULL,
			pf_queuing_worker, args);
	if (err != 0) {
		do_warn(_("failed to create prefetch thread: %s\n"),
			strerror(err));
		pftrace("failed to create prefetch thread for AG %d: %s",
			args->agno, strerror(err));
		args->queuing_thread = 0;
		pf_skip_prefetch_thread(args);
	}

	return err == 0;
}

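/*
 * Remember the mount and the data device fd, and size the prefetch I/O
 * limits from the system page size and the filesystem block size.
 */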
void
init_prefetch(
	xfs_mount_t		*pmp)
{
	mp = pmp;
	mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
	pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
	pf_max_bbs = pf_max_bytes >> BBSHIFT;
	pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
	pf_batch_bytes = DEF_BATCH_BYTES;
	pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
}

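/*
 * Set up prefetch state for one AG: allocate the args structure, its queue,
 * locks and read-ahead semaphore, then either start the queuing thread
 * immediately or, if the previous AG is still being prefetched, chain this
 * AG behind it so the queuing threads run one at a time.
 */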
prefetch_args_t *
start_inode_prefetch(
	xfs_agnumber_t		agno,
	int			dirs_only,
	prefetch_args_t		*prev_args)
{
	prefetch_args_t		*args;
	long			max_queue;

	if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
		return NULL;

	args = calloc(1, sizeof(prefetch_args_t));

	btree_init(&args->io_queue);
	if (pthread_mutex_init(&args->lock, NULL) != 0)
		do_error(_("failed to initialize prefetch mutex\n"));
	if (pthread_cond_init(&args->start_reading, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	if (pthread_cond_init(&args->start_processing, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	args->agno = agno;
	args->dirs_only = dirs_only;

	/*
	 * use only 1/8 of the libxfs cache as we are only counting inodes
	 * and not any other associated metadata like directories
	 */

	max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
	if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
		max_queue = max_queue *
			(mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
			mp->m_ialloc_blks;

	sem_init(&args->ra_count, 0, max_queue);

	if (!prev_args) {
		if (!pf_create_prefetch_thread(args))
			return NULL;
	} else {
		pthread_mutex_lock(&prev_args->lock);
		if (prev_args->prefetch_done) {
			pthread_mutex_unlock(&prev_args->lock);
			if (!pf_create_prefetch_thread(args))
				args = NULL;
		} else {
			prev_args->next_args = args;
			pftrace("queued AG %d after AG %d",
				args->agno, prev_args->agno);
			pthread_mutex_unlock(&prev_args->lock);
		}
	}

	return args;
}

/*
 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
 * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
 * or process @end_ag). The function starts prefetch on the first AG, then loops
 * starting prefetch on the next AG and then blocks processing the current AG as
 * the prefetch queue brings inodes into the processing queue.
 *
 * There is only one prefetch taking place at a time, so the prefetch on the
 * next AG only starts once the current AG has been completely prefetched. Hence
 * the prefetch of the next AG will start some time before the processing of the
 * current AG finishes, ensuring that when we iterate and start processing the
 * next AG there is already a significant queue of inodes to process.
 *
 * Prefetch is done this way to prevent it from running too far ahead of the
 * processing. Allowing it to do so can cause cache thrashing, where new
 * prefetch causes previously prefetched buffers to be reclaimed before the
 * processing thread uses them. This results in reading all the inodes and
 * metadata twice per phase and it greatly slows down the processing. Hence we
 * have to carefully control how far ahead we prefetch...
 */
static void
prefetch_ag_range(
	struct workqueue	*work,
	xfs_agnumber_t		start_ag,
	xfs_agnumber_t		end_ag,
	bool			dirs_only,
	void			(*func)(struct workqueue *,
					xfs_agnumber_t, void *))
{
	int			i;
	struct prefetch_args	*pf_args[2];

	pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
	for (i = start_ag; i < end_ag; i++) {
		/* Don't prefetch end_ag */
		if (i + 1 < end_ag)
			pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
						dirs_only, pf_args[i & 1]);
		func(work, i, pf_args[i & 1]);
	}
}

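/*
 * Argument bundle for the striped prefetch workers; prefetch_ag_range_work()
 * unpacks it, runs prefetch_ag_range() over the AG range and frees the
 * bundle.
 */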
struct pf_work_args {
	xfs_agnumber_t	start_ag;
	xfs_agnumber_t	end_ag;
	bool		dirs_only;
	void		(*func)(struct workqueue *, xfs_agnumber_t, void *);
};

static void
prefetch_ag_range_work(
	struct workqueue	*work,
	xfs_agnumber_t		unused,
	void			*args)
{
	struct pf_work_args	*wargs = args;

	prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
				wargs->dirs_only, wargs->func);
	free(args);
}

/*
 * Do inode prefetch in the most optimal way for the context under which repair
 * has been run.
 */
void
do_inode_prefetch(
	struct xfs_mount	*mp,
	int			stride,
	void			(*func)(struct workqueue *,
					xfs_agnumber_t, void *),
	bool			check_cache,
	bool			dirs_only)
{
	int			i;
	struct workqueue	queue;
	struct workqueue	*queues;
	int			queues_started = 0;

	/*
	 * If the previous phases of repair have not overflowed the buffer
	 * cache, then we don't need to re-read any of the metadata in the
	 * filesystem - it's all in the cache. In that case, run a thread per
	 * CPU to maximise parallelism of the queue to be processed.
	 */
	if (check_cache && !libxfs_bcache_overflowed()) {
		queue.wq_ctx = mp;
		create_work_queue(&queue, mp, libxfs_nproc());
		for (i = 0; i < mp->m_sb.sb_agcount; i++)
			queue_work(&queue, func, i, NULL);
		destroy_work_queue(&queue);
		return;
	}

	/*
	 * single threaded behaviour - single prefetch thread, processed
	 * directly after each AG is queued.
	 */
	if (!stride) {
		queue.wq_ctx = mp;
		prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
				dirs_only, func);
		return;
	}

	/*
	 * create one worker thread for each segment of the volume
	 */
	queues = malloc(thread_count * sizeof(struct workqueue));
	for (i = 0; i < thread_count; i++) {
		struct pf_work_args *wargs;

		wargs = malloc(sizeof(struct pf_work_args));
		wargs->start_ag = i * stride;
		wargs->end_ag = min((i + 1) * stride,
				    mp->m_sb.sb_agcount);
		wargs->dirs_only = dirs_only;
		wargs->func = func;

		create_work_queue(&queues[i], mp, 1);
		queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
		queues_started++;

		if (wargs->end_ag >= mp->m_sb.sb_agcount)
			break;
	}

	/*
	 * wait for workers to complete
	 */
	for (i = 0; i < queues_started; i++)
		destroy_work_queue(&queues[i]);
	free(queues);
}

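/*
 * Block the caller until enough of the AG has been queued that inode
 * processing is allowed to start (or prefetch has been skipped entirely).
 */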
void
wait_for_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pthread_mutex_lock(&args->lock);

	while (!args->can_start_processing) {
		pftrace("waiting to start processing AG %d", args->agno);

		pthread_cond_wait(&args->start_processing, &args->lock);
	}
	pftrace("can start processing AG %d", args->agno);

	pthread_mutex_unlock(&args->lock);
}

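/*
 * Tear down the prefetch state for an AG once processing has finished: join
 * the queuing thread and destroy the queue, locks and semaphore.
 */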
void
cleanup_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pftrace("waiting AG %d prefetch to finish", args->agno);

	if (args->queuing_thread)
		pthread_join(args->queuing_thread, NULL);

	pftrace("AG %d prefetch done", args->agno);

	ASSERT(args->next_args == NULL);

	pthread_mutex_destroy(&args->lock);
	pthread_cond_destroy(&args->start_reading);
	pthread_cond_destroy(&args->start_processing);
	sem_destroy(&args->ra_count);
	btree_destroy(args->io_queue);

	free(args);
}
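
/*
 * Debug tracing helpers, only built when XR_PF_TRACE is defined. The
 * pftrace() calls above are assumed to map to _pftrace() via a macro in
 * prefetch.h; traces go to /tmp/xfs_repair_prefetch.trace.
 */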

#ifdef XR_PF_TRACE

static FILE	*pf_trace_file;

void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

void
pftrace_done(void)
{
	fclose(pf_trace_file);
}

void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	gettimeofday(&tv, NULL);

	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	buf[sizeof(buf)-1] = '\0';
	va_end(args);

	fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec,
		func, buf);
}

#endif