#include "libxfs.h"
#include <pthread.h>
#include "avl.h"
#include "btree.h"
#include "globals.h"
#include "agheader.h"
#include "incore.h"
#include "dir2.h"
#include "protos.h"
#include "err_protos.h"
#include "dinode.h"
#include "bmap.h"
#include "versions.h"
#include "threads.h"
#include "prefetch.h"
#include "progress.h"

int do_prefetch = 1;

/*
 * Performs prefetching by priming the libxfs cache: a dedicated thread
 * scans inodes and reads blocks in ahead of the time they are required.
 *
 * Any I/O errors can be safely ignored.
 */

static xfs_mount_t	*mp;
static int		mp_fd;
static int		pf_max_bytes;
static int		pf_max_bbs;
static int		pf_max_fsbs;
static int		pf_batch_bytes;
static int		pf_batch_fsbs;

static void pf_read_inode_dirs(prefetch_args_t *, xfs_buf_t *);

/*
 * Buffer priorities for the libxfs cache
 *
 * Directory metadata is ranked higher than other metadata as it's used
 * in phases 3, 4 and 6, while other metadata is only used in 3 & 4.
 */

/* intermediate directory btree nodes - can't be queued */
#define B_DIR_BMAP	CACHE_PREFETCH_PRIORITY + 7
/* directory metadata in secondary queue */
#define B_DIR_META_2	CACHE_PREFETCH_PRIORITY + 6
/* dir metadata that had to be fetched from the primary queue to avoid stalling */
#define B_DIR_META_H	CACHE_PREFETCH_PRIORITY + 5
/* single block of directory metadata (can't batch read) */
#define B_DIR_META_S	CACHE_PREFETCH_PRIORITY + 4
/* dir metadata with more than one block fetched in a single I/O */
#define B_DIR_META	CACHE_PREFETCH_PRIORITY + 3
/* inode clusters with directory inodes */
#define B_DIR_INODE	CACHE_PREFETCH_PRIORITY + 2
/* intermediate extent btree nodes */
#define B_BMAP		CACHE_PREFETCH_PRIORITY + 1
/* inode clusters without any directory entries */
#define B_INODE		CACHE_PREFETCH_PRIORITY

/*
 * Test if bit 0 or 2 is set in the "priority tag" of the buffer to see if
 * the buffer is for an inode or other metadata.
 */
#define B_IS_INODE(f)	(((f) & 5) == 0)

#define DEF_BATCH_BYTES	0x10000

#define MAX_BUFS	128

#define IO_THRESHOLD	(MAX_BUFS * 2)

typedef enum pf_which {
	PF_PRIMARY,
	PF_SECONDARY,
	PF_META_ONLY
} pf_which_t;


static inline void
pf_start_processing(
	prefetch_args_t		*args)
{
	if (!args->can_start_processing) {
		pftrace("signalling processing for AG %d", args->agno);

		args->can_start_processing = 1;
		pthread_cond_signal(&args->start_processing);
	}
}

static inline void
pf_start_io_workers(
	prefetch_args_t		*args)
{
	if (!args->can_start_reading) {
		pftrace("signalling reading for AG %d", args->agno);

		args->can_start_reading = 1;
		pthread_cond_broadcast(&args->start_reading);
	}
}

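/*
 * Queue a buffer for prefetch I/O. The buffer is grabbed with a trylock,
 * tagged with the priority in @flag and inserted into the AG's io_queue
 * btree keyed by its file system block number; buffers that are already
 * up to date are released immediately (inode clusters are first scanned
 * for directory data).
 */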
static void
pf_queue_io(
	prefetch_args_t		*args,
	struct xfs_buf_map	*map,
	int			nmaps,
	int			flag)
{
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, map[0].bm_bn);

	/*
	 * Never block on a buffer lock here, given that the actual repair
	 * code might lock buffers in a different order from us.  Given that
	 * the lock holder is either reading it from disk itself or
	 * completely overwriting it, this behaviour is perfectly fine.
	 */
	bp = libxfs_getbuf_map(mp->m_dev, map, nmaps, LIBXFS_GETBUF_TRYLOCK);
	if (!bp)
		return;

	if (bp->b_flags & LIBXFS_B_UPTODATE) {
		if (B_IS_INODE(flag))
			pf_read_inode_dirs(args, bp);
		XFS_BUF_SET_PRIORITY(bp, XFS_BUF_PRIORITY(bp) +
					 CACHE_PREFETCH_PRIORITY);
		libxfs_putbuf(bp);
		return;
	}
	XFS_BUF_SET_PRIORITY(bp, flag);

	pthread_mutex_lock(&args->lock);

	btree_insert(args->io_queue, fsbno, bp);

	if (fsbno > args->last_bno_read) {
		if (B_IS_INODE(flag)) {
			args->inode_bufs_queued++;
			if (args->inode_bufs_queued == IO_THRESHOLD)
				pf_start_io_workers(args);
		}
	} else {
		ASSERT(!B_IS_INODE(flag));
		XFS_BUF_SET_PRIORITY(bp, B_DIR_META_2);
	}

	pftrace("getbuf %c %p (%llu) in AG %d (fsbno = %lu) added to queue"
		"(inode_bufs_queued = %d, last_bno = %lu)", B_IS_INODE(flag) ?
		'I' : 'M', bp, (long long)XFS_BUF_ADDR(bp), args->agno, fsbno,
		args->inode_bufs_queued, args->last_bno_read);

	pf_start_processing(args);

	pthread_mutex_unlock(&args->lock);
}

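/*
 * Queue prefetch I/O for the directory blocks described by a list of bmbt
 * records, coalescing them into directory-block-sized buffer maps. Returns
 * 0 if the record list looks corrupt, 1 otherwise.
 */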
static int
pf_read_bmbt_reclist(
	prefetch_args_t		*args,
	xfs_bmbt_rec_t		*rp,
	int			numrecs)
{
	int			i;
	xfs_bmbt_irec_t		irec;
	xfs_filblks_t		cp = 0;	/* prev count */
	xfs_fileoff_t		op = 0;	/* prev offset */
#define MAP_ARRAY_SZ 4
	struct xfs_buf_map	map_array[MAP_ARRAY_SZ];
	struct xfs_buf_map	*map = map_array;
	int			max_extents = MAP_ARRAY_SZ;
	int			nmaps = 0;
	unsigned int		len = 0;
	int			ret = 0;


	for (i = 0; i < numrecs; i++) {
		libxfs_bmbt_disk_get_all(rp + i, &irec);

		if (((i > 0) && (op + cp > irec.br_startoff)) ||
				(irec.br_blockcount == 0) ||
				(irec.br_startoff >= fs_max_file_offset))
			goto out_free;

		if (!verify_dfsbno(mp, irec.br_startblock) || !verify_dfsbno(mp,
				irec.br_startblock + irec.br_blockcount - 1))
			goto out_free;

		if (!args->dirs_only && ((irec.br_startoff +
				irec.br_blockcount) >= mp->m_dir_geo->freeblk))
			break;	/* only Phase 6 reads the free blocks */

		op = irec.br_startoff;
		cp = irec.br_blockcount;

		while (irec.br_blockcount) {
			unsigned int	bm_len;

			pftrace("queuing dir extent in AG %d", args->agno);

			if (len + irec.br_blockcount >= mp->m_dir_geo->fsbcount)
				bm_len = mp->m_dir_geo->fsbcount - len;
			else
				bm_len = irec.br_blockcount;
			len += bm_len;

			map[nmaps].bm_bn = XFS_FSB_TO_DADDR(mp,
						irec.br_startblock);
			map[nmaps].bm_len = XFS_FSB_TO_BB(mp, bm_len);
			nmaps++;

			if (len == mp->m_dir_geo->fsbcount) {
				pf_queue_io(args, map, nmaps, B_DIR_META);
				len = 0;
				nmaps = 0;
			}

			irec.br_blockcount -= bm_len;
			irec.br_startblock += bm_len;

			/*
			 * Handle very fragmented dir2 blocks with dynamically
			 * allocated buffer maps.
			 */
			if (nmaps >= max_extents) {
				struct xfs_buf_map *old_map = NULL;

				if (map == map_array) {
					old_map = map;
					map = NULL;
				}
				max_extents *= 2;
				map = realloc(map, max_extents * sizeof(*map));
				if (map == NULL) {
					do_error(
				_("couldn't malloc dir2 buffer list\n"));
					exit(1);
				}
				if (old_map)
					memcpy(map, old_map, sizeof(map_array));
			}

		}
	}
	ret = 1;
out_free:
	if (map != map_array)
		free(map);
	return ret;
}

/*
 * simplified version of the main scan_lbtree. Returns 0 to stop.
 */

static int
pf_scan_lbtree(
	xfs_fsblock_t		dbno,
	int			level,
	int			isadir,
	prefetch_args_t		*args,
	int			(*func)(struct xfs_btree_block	*block,
					int			level,
					int			isadir,
					prefetch_args_t		*args))
{
	xfs_buf_t		*bp;
	int			rc;

	bp = libxfs_readbuf(mp->m_dev, XFS_FSB_TO_DADDR(mp, dbno),
			XFS_FSB_TO_BB(mp, 1), 0, &xfs_bmbt_buf_ops);
	if (!bp)
		return 0;

	XFS_BUF_SET_PRIORITY(bp, isadir ? B_DIR_BMAP : B_BMAP);

	/*
	 * If the verifier flagged a problem with the buffer, we can't trust
	 * its contents for the purposes of reading ahead.  Stop prefetching
	 * the tree and mark the buffer unchecked so that the next read of the
	 * buffer will retain the error status and be acted upon appropriately.
	 */
	if (bp->b_error) {
		bp->b_flags |= LIBXFS_B_UNCHECKED;
		libxfs_putbuf(bp);
		return 0;
	}

	rc = (*func)(XFS_BUF_TO_BLOCK(bp), level - 1, isadir, args);

	libxfs_putbuf(bp);

	return rc;
}

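/*
 * Callback for pf_scan_lbtree: validates a bmap btree block, queues the
 * directory extents found in its leaf records and recurses into any child
 * blocks. Returns 0 to stop the scan.
 */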
static int
pf_scanfunc_bmap(
	struct xfs_btree_block	*block,
	int			level,
	int			isadir,
	prefetch_args_t		*args)
{
	xfs_bmbt_ptr_t		*pp;
	int			numrecs;
	int			i;
	xfs_fsblock_t		dbno;

	/*
	 * do some validation on the block contents
	 */
	if ((block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC) &&
	     block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC)) ||
			(be16_to_cpu(block->bb_level) != level))
		return 0;

	numrecs = be16_to_cpu(block->bb_numrecs);

	if (level == 0) {
		if (numrecs > mp->m_bmap_dmxr[0] || !isadir)
			return 0;
		return pf_read_bmbt_reclist(args,
			XFS_BMBT_REC_ADDR(mp, block, 1), numrecs);
	}

	if (numrecs > mp->m_bmap_dmxr[1])
		return 0;

	pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			return 0;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			return 0;
	}
	return 1;
}

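/*
 * Walk the bmap btree rooted in the inode's data fork, prefetching the
 * intermediate btree blocks and, for directories, the extents they map.
 */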
static void
pf_read_btinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino,
	int			isadir)
{
	xfs_bmdr_block_t	*dib;
	xfs_bmbt_ptr_t		*pp;
	int			i;
	int			level;
	int			numrecs;
	int			dsize;
	xfs_fsblock_t		dbno;

	dib = (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dino);

	level = be16_to_cpu(dib->bb_level);
	numrecs = be16_to_cpu(dib->bb_numrecs);

	if ((numrecs == 0) || (level == 0) ||
			(level > XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))
		return;
	/*
	 * use bmdr/dfork_dsize since the root block is in the data fork
	 */
	if (XFS_BMDR_SPACE_CALC(numrecs) > XFS_DFORK_DSIZE(dino, mp))
		return;

	dsize = XFS_DFORK_DSIZE(dino, mp);
	pp = XFS_BMDR_PTR_ADDR(dib, 1, libxfs_bmdr_maxrecs(dsize, 0));

	for (i = 0; i < numrecs; i++) {
		dbno = get_unaligned_be64(&pp[i]);
		if (!verify_dfsbno(mp, dbno))
			break;
		if (!pf_scan_lbtree(dbno, level, isadir, args, pf_scanfunc_bmap))
			break;
	}
}

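/* Queue prefetch I/O for the extent list held in an inode's data fork. */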
static void
pf_read_exinode(
	prefetch_args_t		*args,
	xfs_dinode_t		*dino)
{
	pf_read_bmbt_reclist(args, (xfs_bmbt_rec_t *)XFS_DFORK_DPTR(dino),
			be32_to_cpu(dino->di_nextents));
}

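/*
 * Scan the inodes in a cluster buffer and queue prefetch I/O for any
 * directory data they reference. The buffer's cache priority is raised
 * if the cluster contains directory inodes.
 */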
static void
pf_read_inode_dirs(
	prefetch_args_t		*args,
	xfs_buf_t		*bp)
{
	xfs_dinode_t		*dino;
	int			icnt = 0;
	int			hasdir = 0;
	int			isadir;

	libxfs_readbuf_verify(bp, &xfs_inode_buf_ops);
	if (bp->b_error)
		return;

	for (icnt = 0; icnt < (XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog); icnt++) {
		dino = xfs_make_iptr(mp, bp, icnt);

		/*
		 * We are only prefetching directory contents in extents
		 * and btree nodes for other inodes
		 */
		isadir = (be16_to_cpu(dino->di_mode) & S_IFMT) == S_IFDIR;
		hasdir |= isadir;

		if (dino->di_format <= XFS_DINODE_FMT_LOCAL)
			continue;

		if (!isadir && (dino->di_format == XFS_DINODE_FMT_EXTENTS ||
				args->dirs_only))
			continue;

		/*
		 * do some checks on the inode to see if we can prefetch
		 * its directory data. It's a cut down version of
		 * process_dinode_int() in dinode.c.
		 */
		if (dino->di_format > XFS_DINODE_FMT_BTREE)
			continue;

		if (be16_to_cpu(dino->di_magic) != XFS_DINODE_MAGIC)
			continue;

		if (!libxfs_dinode_good_version(mp, dino->di_version))
			continue;

		if (be64_to_cpu(dino->di_size) <= XFS_DFORK_DSIZE(dino, mp))
			continue;

		if ((dino->di_forkoff != 0) &&
		    (dino->di_forkoff >= XFS_LITINO(mp, dino->di_version) >> 3))
			continue;

		switch (dino->di_format) {
			case XFS_DINODE_FMT_EXTENTS:
				pf_read_exinode(args, dino);
				break;
			case XFS_DINODE_FMT_BTREE:
				pf_read_btinode(args, dino, isadir);
				break;
		}
	}
	if (hasdir)
		XFS_BUF_SET_PRIORITY(bp, B_DIR_INODE);
}

/*
 * pf_batch_read must be called with the lock locked.
 */
static void
pf_batch_read(
	prefetch_args_t		*args,
	pf_which_t		which,
	void			*buf)
{
	xfs_buf_t		*bplist[MAX_BUFS];
	unsigned int		num;
	off64_t			first_off, last_off, next_off;
	int			len, size;
	int			i;
	int			inode_bufs;
	unsigned long		fsbno = 0;
	unsigned long		max_fsbno;
	char			*pbuf;

	for (;;) {
		num = 0;
		if (which == PF_SECONDARY) {
			bplist[0] = btree_find(args->io_queue, 0, &fsbno);
			max_fsbno = MIN(fsbno + pf_max_fsbs,
					args->last_bno_read);
		} else {
			bplist[0] = btree_find(args->io_queue,
						args->last_bno_read, &fsbno);
			max_fsbno = fsbno + pf_max_fsbs;
		}
		while (bplist[num] && num < MAX_BUFS && fsbno < max_fsbno) {
			/*
			 * Discontiguous buffers need special handling, so stop
			 * gathering new buffers and process the list and this
			 * discontiguous buffer immediately. This avoids the
			 * complexity of keeping a separate discontiguous buffer
			 * list and seeking back over ranges we've already done
			 * optimised reads for.
			 */
			if ((bplist[num]->b_flags & LIBXFS_B_DISCONTIG)) {
				num++;
				break;
			}

			if (which != PF_META_ONLY ||
			    !B_IS_INODE(XFS_BUF_PRIORITY(bplist[num])))
				num++;
			if (num == MAX_BUFS)
				break;
			bplist[num] = btree_lookup_next(args->io_queue, &fsbno);
		}
		if (!num)
			return;

		/*
		 * do a big read if 25% of the potential buffer is useful,
		 * otherwise, find as many close together blocks as possible
		 * and read them in one read
		 */
		first_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[0]));
		last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
			XFS_BUF_SIZE(bplist[num-1]);
		while (num > 1 && last_off - first_off > pf_max_bytes) {
			num--;
			last_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[num-1])) +
				XFS_BUF_SIZE(bplist[num-1]);
		}
		if (num < ((last_off - first_off) >> (mp->m_sb.sb_blocklog + 3))) {
			/*
			 * not enough blocks for one big read, so determine
			 * the number of blocks that are close enough.
			 */
			last_off = first_off + XFS_BUF_SIZE(bplist[0]);
			for (i = 1; i < num; i++) {
				next_off = LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) +
						XFS_BUF_SIZE(bplist[i]);
				if (next_off - last_off > pf_batch_bytes)
					break;
				last_off = next_off;
			}
			num = i;
		}

		for (i = 0; i < num; i++) {
			if (btree_delete(args->io_queue, XFS_DADDR_TO_FSB(mp,
					XFS_BUF_ADDR(bplist[i]))) == NULL)
				do_error(_("prefetch corruption\n"));
		}

		if (which == PF_PRIMARY) {
			for (inode_bufs = 0, i = 0; i < num; i++) {
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					inode_bufs++;
			}
			args->inode_bufs_queued -= inode_bufs;
			if (inode_bufs && (first_off >> mp->m_sb.sb_blocklog) >
					pf_batch_fsbs)
				args->last_bno_read = (first_off >> mp->m_sb.sb_blocklog);
		}
#ifdef XR_PF_TRACE
		pftrace("reading bbs %llu to %llu (%d bufs) from %s queue in AG %d (last_bno = %lu, inode_bufs = %d)",
			(long long)XFS_BUF_ADDR(bplist[0]),
			(long long)XFS_BUF_ADDR(bplist[num-1]), num,
			(which != PF_SECONDARY) ? "pri" : "sec", args->agno,
			args->last_bno_read, args->inode_bufs_queued);
#endif
		pthread_mutex_unlock(&args->lock);

		/*
		 * now read the data and put into the xfs_buf_t's
		 */
		len = pread(mp_fd, buf, (int)(last_off - first_off), first_off);

		/*
		 * Check the last buffer on the list to see if we need to
		 * process a discontiguous buffer. The gather loop above
		 * guarantees that only the last buffer in the list will be a
		 * discontiguous buffer.
		 */
		if ((bplist[num - 1]->b_flags & LIBXFS_B_DISCONTIG)) {
			libxfs_readbufr_map(mp->m_ddev_targp, bplist[num - 1], 0);
			bplist[num - 1]->b_flags |= LIBXFS_B_UNCHECKED;
			libxfs_putbuf(bplist[num - 1]);
			num--;
		}

		if (len > 0) {
			/*
			 * go through the xfs_buf_t list copying from the
			 * read buffer into the xfs_buf_t's and release them.
			 */
			for (i = 0; i < num; i++) {

				pbuf = ((char *)buf) + (LIBXFS_BBTOOFF64(XFS_BUF_ADDR(bplist[i])) - first_off);
				size = XFS_BUF_SIZE(bplist[i]);
				if (len < size)
					break;
				memcpy(XFS_BUF_PTR(bplist[i]), pbuf, size);
				bplist[i]->b_flags |= (LIBXFS_B_UPTODATE |
						       LIBXFS_B_UNCHECKED);
				len -= size;
				if (B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])))
					pf_read_inode_dirs(args, bplist[i]);
				else if (which == PF_META_ONLY)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_H);
				else if (which == PF_PRIMARY && num == 1)
					XFS_BUF_SET_PRIORITY(bplist[i],
								B_DIR_META_S);
			}
		}
		for (i = 0; i < num; i++) {
			pftrace("putbuf %c %p (%llu) in AG %d",
				B_IS_INODE(XFS_BUF_PRIORITY(bplist[i])) ? 'I' : 'M',
				bplist[i], (long long)XFS_BUF_ADDR(bplist[i]),
				args->agno);
			libxfs_putbuf(bplist[i]);
		}
		pthread_mutex_lock(&args->lock);
		if (which != PF_SECONDARY) {
			pftrace("inode_bufs_queued for AG %d = %d", args->agno,
				args->inode_bufs_queued);
			/*
			 * if the primary inode queue is running low, process
			 * metadata in both queues to avoid I/O starvation as
			 * the processing thread would be waiting for a
			 * metadata buffer
			 */
			if (which == PF_PRIMARY && !args->queuing_done &&
					args->inode_bufs_queued < IO_THRESHOLD) {
				pftrace("reading metadata bufs from primary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_META_ONLY, buf);

				pftrace("reading bufs from secondary queue for AG %d",
					args->agno);

				pf_batch_read(args, PF_SECONDARY, buf);
			}
		}
	}
}

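/*
 * Prefetch I/O worker thread: waits until reading may start, then drains
 * the primary and secondary queues via pf_batch_read() until queuing has
 * finished and the io_queue is empty.
 */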
static void *
pf_io_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	void			*buf = memalign(libxfs_device_alignment(),
						pf_max_bytes);

	if (buf == NULL)
		return NULL;

	pthread_mutex_lock(&args->lock);
	while (!args->queuing_done || !btree_is_empty(args->io_queue)) {
		pftrace("waiting to start prefetch I/O for AG %d", args->agno);

		while (!args->can_start_reading && !args->queuing_done)
			pthread_cond_wait(&args->start_reading, &args->lock);

		pftrace("starting prefetch I/O for AG %d", args->agno);

		pf_batch_read(args, PF_PRIMARY, buf);
		pf_batch_read(args, PF_SECONDARY, buf);

		pftrace("ran out of bufs to prefetch for AG %d", args->agno);

		if (!args->queuing_done)
			args->can_start_reading = 0;
	}
	pthread_mutex_unlock(&args->lock);

	free(buf);

	pftrace("finished prefetch I/O for AG %d", args->agno);

	return NULL;
}

static int
pf_create_prefetch_thread(
	prefetch_args_t		*args);

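/*
 * Queuing thread for one AG: starts the prefetch I/O workers, then walks
 * the AG's inode records queuing a read for every non-sparse inode cluster
 * (directory inodes at a higher priority), throttled by the ra_count
 * semaphore. Once everything is queued it waits for the I/O workers and
 * chains prefetch to the next AG, if one is waiting.
 */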
static void *
pf_queuing_worker(
	void			*param)
{
	prefetch_args_t		*args = param;
	int			num_inos;
	ino_tree_node_t		*irec;
	ino_tree_node_t		*cur_irec;
	int			blks_per_cluster;
	xfs_agblock_t		bno;
	int			i;
	int			err;
	uint64_t		sparse;

	blks_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
	if (blks_per_cluster == 0)
		blks_per_cluster = 1;

	for (i = 0; i < PF_THREAD_COUNT; i++) {
		err = pthread_create(&args->io_threads[i], NULL,
				pf_io_worker, args);
		if (err != 0) {
			do_warn(_("failed to create prefetch thread: %s\n"),
				strerror(err));
			args->io_threads[i] = 0;
			if (i == 0) {
				pf_start_processing(args);
				return NULL;
			}
			/*
			 * since we have at least one I/O thread, use them for
			 * prefetch
			 */
			break;
		}
	}
	pftrace("starting prefetch for AG %d", args->agno);

	for (irec = findfirst_inode_rec(args->agno); irec != NULL;
			irec = next_ino_rec(irec)) {

		cur_irec = irec;

		num_inos = XFS_INODES_PER_CHUNK;
		while (num_inos < mp->m_ialloc_inos && irec != NULL) {
			irec = next_ino_rec(irec);
			num_inos += XFS_INODES_PER_CHUNK;
		}

		if (args->dirs_only && cur_irec->ino_isa_dir == 0)
			continue;
#ifdef XR_PF_TRACE
		sem_getvalue(&args->ra_count, &i);
		pftrace("queuing irec %p in AG %d, sem count = %d",
			irec, args->agno, i);
#endif
		err = sem_trywait(&args->ra_count);
		if (err < 0 && errno == EAGAIN) {
			/*
			 * Kick the queue once we have reached the limit;
			 * without this the threads processing the inodes
			 * might get stuck on a buffer that has been locked
			 * and added to the I/O queue but is waiting for
			 * the thread to be woken.
			 */
			pf_start_io_workers(args);
			sem_wait(&args->ra_count);
		}

		num_inos = 0;
		bno = XFS_AGINO_TO_AGBNO(mp, cur_irec->ino_startnum);
		sparse = cur_irec->ir_sparse;

		do {
			struct xfs_buf_map	map;

			map.bm_bn = XFS_AGB_TO_DADDR(mp, args->agno, bno);
			map.bm_len = XFS_FSB_TO_BB(mp, blks_per_cluster);

			/*
			 * Queue I/O for each non-sparse cluster. We can check
			 * sparse state in cluster sized chunks as cluster size
			 * is the min. granularity of sparse irec regions.
			 */
			if ((sparse & ((1ULL << inodes_per_cluster) - 1)) == 0)
				pf_queue_io(args, &map, 1,
					    (cur_irec->ino_isa_dir != 0) ?
					     B_DIR_INODE : B_INODE);

			bno += blks_per_cluster;
			num_inos += inodes_per_cluster;
			sparse >>= inodes_per_cluster;
		} while (num_inos < mp->m_ialloc_inos);
	}

	pthread_mutex_lock(&args->lock);

	pftrace("finished queuing inodes for AG %d (inode_bufs_queued = %d)",
		args->agno, args->inode_bufs_queued);

	args->queuing_done = 1;
	pf_start_io_workers(args);
	pf_start_processing(args);
	pthread_mutex_unlock(&args->lock);

	/* now wait for the readers to finish */
	for (i = 0; i < PF_THREAD_COUNT; i++)
		if (args->io_threads[i])
			pthread_join(args->io_threads[i], NULL);

	pftrace("prefetch for AG %d finished", args->agno);

	pthread_mutex_lock(&args->lock);

	ASSERT(btree_is_empty(args->io_queue));

	args->prefetch_done = 1;
	if (args->next_args)
		pf_create_prefetch_thread(args->next_args);

	pthread_mutex_unlock(&args->lock);

	return NULL;
}

static int
pf_create_prefetch_thread(
	prefetch_args_t		*args)
{
	int			err;

	pftrace("creating queue thread for AG %d", args->agno);

	err = pthread_create(&args->queuing_thread, NULL,
			pf_queuing_worker, args);
	if (err != 0) {
		do_warn(_("failed to create prefetch thread: %s\n"),
			strerror(err));
		args->queuing_thread = 0;
		cleanup_inode_prefetch(args);
	}

	return err == 0;
}

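/*
 * Record the mount and data device fd and derive the prefetch I/O size
 * limits from the system page size and the filesystem block size.
 */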
void
init_prefetch(
	xfs_mount_t		*pmp)
{
	mp = pmp;
	mp_fd = libxfs_device_to_fd(mp->m_ddev_targp->dev);
	pf_max_bytes = sysconf(_SC_PAGE_SIZE) << 7;
	pf_max_bbs = pf_max_bytes >> BBSHIFT;
	pf_max_fsbs = pf_max_bytes >> mp->m_sb.sb_blocklog;
	pf_batch_bytes = DEF_BATCH_BYTES;
	pf_batch_fsbs = DEF_BATCH_BYTES >> (mp->m_sb.sb_blocklog + 1);
}

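/*
 * Allocate and initialise the prefetch state for one AG and start its
 * queuing thread, either immediately or chained behind the previous AG's
 * prefetch if that is still running.
 */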
prefetch_args_t *
start_inode_prefetch(
	xfs_agnumber_t		agno,
	int			dirs_only,
	prefetch_args_t		*prev_args)
{
	prefetch_args_t		*args;
	long			max_queue;

	if (!do_prefetch || agno >= mp->m_sb.sb_agcount)
		return NULL;

	args = calloc(1, sizeof(prefetch_args_t));

	btree_init(&args->io_queue);
	if (pthread_mutex_init(&args->lock, NULL) != 0)
		do_error(_("failed to initialize prefetch mutex\n"));
	if (pthread_cond_init(&args->start_reading, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	if (pthread_cond_init(&args->start_processing, NULL) != 0)
		do_error(_("failed to initialize prefetch cond var\n"));
	args->agno = agno;
	args->dirs_only = dirs_only;

	/*
	 * use only 1/8 of the libxfs cache as we are only counting inodes
	 * and not any other associated metadata like directories
	 */

	max_queue = libxfs_bcache->c_maxcount / thread_count / 8;
	if (mp->m_inode_cluster_size > mp->m_sb.sb_blocksize)
		max_queue = max_queue *
			(mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog) /
			mp->m_ialloc_blks;

	sem_init(&args->ra_count, 0, max_queue);

	if (!prev_args) {
		if (!pf_create_prefetch_thread(args))
			return NULL;
	} else {
		pthread_mutex_lock(&prev_args->lock);
		if (prev_args->prefetch_done) {
			if (!pf_create_prefetch_thread(args))
				args = NULL;
		} else
			prev_args->next_args = args;
		pthread_mutex_unlock(&prev_args->lock);
	}

	return args;
}

/*
 * prefetch_ag_range runs a prefetch-and-process loop across a range of AGs. It
 * begins with @start_ag, and finishes with @end_ag - 1 (i.e. does not prefetch
 * or process @end_ag). The function starts prefetch on the first AG, then loops
 * starting prefetch on the next AG and then blocks processing the current AG as
 * the prefetch queue brings inodes into the processing queue.
 *
 * There is only one prefetch taking place at a time, so the prefetch on the
 * next AG only starts once the current AG has been completely prefetched. Hence
 * the prefetch of the next AG will start some time before the processing of the
 * current AG finishes, ensuring that when we iterate and start processing the
 * next AG there is already a significant queue of inodes to process.
 *
 * Prefetch is done this way to prevent it from running too far ahead of the
 * processing. Allowing it to do so can cause cache thrashing, where new
 * prefetch causes previously prefetched buffers to be reclaimed before the
 * processing thread uses them. This results in reading all the inodes and
 * metadata twice per phase and it greatly slows down the processing. Hence we
 * have to carefully control how far ahead we prefetch...
 */
static void
prefetch_ag_range(
	struct work_queue	*work,
	xfs_agnumber_t		start_ag,
	xfs_agnumber_t		end_ag,
	bool			dirs_only,
	void			(*func)(struct work_queue *,
					xfs_agnumber_t, void *))
{
	int			i;
	struct prefetch_args	*pf_args[2];

	pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL);
	for (i = start_ag; i < end_ag; i++) {
		/* Don't prefetch end_ag */
		if (i + 1 < end_ag)
			pf_args[(~i) & 1] = start_inode_prefetch(i + 1,
						dirs_only, pf_args[i & 1]);
		func(work, i, pf_args[i & 1]);
	}
}

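/* Arguments for one prefetch_ag_range_work() call, one per worker queue. */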
struct pf_work_args {
	xfs_agnumber_t		start_ag;
	xfs_agnumber_t		end_ag;
	bool			dirs_only;
	void			(*func)(struct work_queue *, xfs_agnumber_t, void *);
};

static void
prefetch_ag_range_work(
	struct work_queue	*work,
	xfs_agnumber_t		unused,
	void			*args)
{
	struct pf_work_args	*wargs = args;

	prefetch_ag_range(work, wargs->start_ag, wargs->end_ag,
			  wargs->dirs_only, wargs->func);
	free(args);
}

/*
 * Do inode prefetch in the most optimal way for the context under which repair
 * has been run.
 */
void
do_inode_prefetch(
	struct xfs_mount	*mp,
	int			stride,
	void			(*func)(struct work_queue *,
					xfs_agnumber_t, void *),
	bool			check_cache,
	bool			dirs_only)
{
	int			i;
	struct work_queue	queue;
	struct work_queue	*queues;
	int			queues_started = 0;

	/*
	 * If the previous phases of repair have not overflowed the buffer
	 * cache, then we don't need to re-read any of the metadata in the
	 * filesystem - it's all in the cache. In that case, run a thread per
	 * CPU to maximise parallelism of the queue to be processed.
	 */
	if (check_cache && !libxfs_bcache_overflowed()) {
		queue.mp = mp;
		create_work_queue(&queue, mp, libxfs_nproc());
		for (i = 0; i < mp->m_sb.sb_agcount; i++)
			queue_work(&queue, func, i, NULL);
		destroy_work_queue(&queue);
		return;
	}

	/*
	 * single threaded behaviour - single prefetch thread, processed
	 * directly after each AG is queued.
	 */
	if (!stride) {
		queue.mp = mp;
		prefetch_ag_range(&queue, 0, mp->m_sb.sb_agcount,
				dirs_only, func);
		return;
	}

	/*
	 * create one worker thread for each segment of the volume
	 */
	queues = malloc(thread_count * sizeof(work_queue_t));
	for (i = 0; i < thread_count; i++) {
		struct pf_work_args *wargs;

		wargs = malloc(sizeof(struct pf_work_args));
		wargs->start_ag = i * stride;
		wargs->end_ag = min((i + 1) * stride,
				    mp->m_sb.sb_agcount);
		wargs->dirs_only = dirs_only;
		wargs->func = func;

		create_work_queue(&queues[i], mp, 1);
		queue_work(&queues[i], prefetch_ag_range_work, 0, wargs);
		queues_started++;

		if (wargs->end_ag >= mp->m_sb.sb_agcount)
			break;
	}

	/*
	 * wait for workers to complete
	 */
	for (i = 0; i < queues_started; i++)
		destroy_work_queue(&queues[i]);
	free(queues);
}

void
wait_for_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pthread_mutex_lock(&args->lock);

	while (!args->can_start_processing) {
		pftrace("waiting to start processing AG %d", args->agno);

		pthread_cond_wait(&args->start_processing, &args->lock);
	}
	pftrace("can start processing AG %d", args->agno);

	pthread_mutex_unlock(&args->lock);
}

void
cleanup_inode_prefetch(
	prefetch_args_t		*args)
{
	if (args == NULL)
		return;

	pftrace("waiting AG %d prefetch to finish", args->agno);

	if (args->queuing_thread)
		pthread_join(args->queuing_thread, NULL);

	pftrace("AG %d prefetch done", args->agno);

	pthread_mutex_destroy(&args->lock);
	pthread_cond_destroy(&args->start_reading);
	pthread_cond_destroy(&args->start_processing);
	sem_destroy(&args->ra_count);
	btree_destroy(args->io_queue);

	free(args);
}

#ifdef XR_PF_TRACE

static FILE	*pf_trace_file;

void
pftrace_init(void)
{
	pf_trace_file = fopen("/tmp/xfs_repair_prefetch.trace", "w");
	setvbuf(pf_trace_file, NULL, _IOLBF, 1024);
}

void
pftrace_done(void)
{
	fclose(pf_trace_file);
}

void
_pftrace(const char *func, const char *msg, ...)
{
	char		buf[200];
	struct timeval	tv;
	va_list		args;

	gettimeofday(&tv, NULL);

	va_start(args, msg);
	vsnprintf(buf, sizeof(buf), msg, args);
	buf[sizeof(buf)-1] = '\0';
	va_end(args);

	fprintf(pf_trace_file, "%lu.%06lu %s: %s\n", tv.tv_sec, tv.tv_usec,
		func, buf);
}

#endif