// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "libxfs_priv.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_trace.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"

/*
 * Reverse map btree.
 *
 * This is a per-ag tree used to track the owner(s) of a given extent. With
 * reflink it is possible for there to be multiple owners, which is a departure
 * from classic XFS. Owner records for data extents are inserted when the
 * extent is mapped and removed when an extent is unmapped. Owner records for
 * all other block types (i.e. metadata) are inserted when an extent is
 * allocated and removed when an extent is freed. There can only be one owner
 * of a metadata extent, usually an inode or some other metadata structure like
 * an AG btree.
 *
 * The rmap btree is part of the free space management, so blocks for the tree
 * are sourced from the agfl. Hence we need transaction reservation support for
 * this tree so that the freelist is always large enough. This also impacts on
 * the minimum space we need to leave free in the AG.
 *
 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
 * but it is the only way to enforce unique keys when a block can be owned by
 * multiple files at any offset. There's no need to order/search by extent
 * size for online updating/management of the tree. It is intended that most
 * reverse lookups will be to find the owner(s) of a particular block, or to
 * try to recover tree and file data from corrupt primary metadata.
 */
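
/*
 * Illustrative example of the ordering rules above (hypothetical values,
 * not taken from any real filesystem): two reflinked files that share
 * physical AG block 100 produce two distinct records, sorted by
 * [startblock, owner, offset]:
 *
 *	{ rm_startblock = 100, rm_blockcount = 8, rm_owner = 131, rm_offset = 0 }
 *	{ rm_startblock = 100, rm_blockcount = 8, rm_owner = 142, rm_offset = 24 }
 *
 * Equal startblocks are disambiguated by owner and then by offset, which
 * is what keeps the composite key unique for shared blocks.
 */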

static struct xfs_btree_cur *
xfs_rmapbt_dup_cursor(
	struct xfs_btree_cur	*cur)
{
	return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
			cur->bc_ag.agbp, cur->bc_ag.pag);
}

STATIC void
xfs_rmapbt_set_root(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				inc)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	int			btnum = cur->bc_btnum;

	ASSERT(ptr->s != 0);

	agf->agf_roots[btnum] = ptr->s;
	be32_add_cpu(&agf->agf_levels[btnum], inc);
	cur->bc_ag.pag->pagf_levels[btnum] += inc;

	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
}

STATIC int
xfs_rmapbt_alloc_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*start,
	union xfs_btree_ptr		*new,
	int				*stat)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	int			error;
	xfs_agblock_t		bno;

	/* Allocate the new block from the freelist. If we can't, give up. */
	error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_ag.agbp,
			&bno, 1);
	if (error)
		return error;

	trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
	if (bno == NULLAGBLOCK) {
		*stat = 0;
		return 0;
	}

	xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);

	new->s = cpu_to_be32(bno);
	be32_add_cpu(&agf->agf_rmap_blocks, 1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);

	xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);

	*stat = 1;
	return 0;
}

STATIC int
xfs_rmapbt_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	struct xfs_buf		*agbp = cur->bc_ag.agbp;
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	xfs_agblock_t		bno;
	int			error;

	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
	trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
			bno, 1);
	be32_add_cpu(&agf->agf_rmap_blocks, -1);
	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
	error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
	if (error)
		return error;

	xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
	return 0;
}

STATIC int
xfs_rmapbt_get_minrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mnr[level != 0];
}

STATIC int
xfs_rmapbt_get_maxrecs(
	struct xfs_btree_cur	*cur,
	int			level)
{
	return cur->bc_mp->m_rmap_mxr[level != 0];
}

STATIC void
xfs_rmapbt_init_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = rec->rmap.rm_offset;
}

/*
 * The high key for a reverse mapping record can be computed by shifting
 * the startblock and offset to the highest value that would still map
 * to that record. In practice this means that we add blockcount-1 to
 * the startblock for all records, and if the record is for a data/attr
 * fork mapping, we add blockcount-1 to the offset too.
 */
STATIC void
xfs_rmapbt_init_high_key_from_rec(
	union xfs_btree_key		*key,
	const union xfs_btree_rec	*rec)
{
	uint64_t			off;
	int				adj;

	adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;

	key->rmap.rm_startblock = rec->rmap.rm_startblock;
	be32_add_cpu(&key->rmap.rm_startblock, adj);
	key->rmap.rm_owner = rec->rmap.rm_owner;
	key->rmap.rm_offset = rec->rmap.rm_offset;
	if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
	    XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
		return;
	off = be64_to_cpu(key->rmap.rm_offset);
	off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
	key->rmap.rm_offset = cpu_to_be64(off);
}
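
/*
 * Worked example for the high key computation above, using made-up
 * numbers: a data fork record { rm_startblock = 100, rm_blockcount = 8,
 * rm_owner = 131, rm_offset = 16 } covers AG blocks 100-107 and file
 * offsets 16-23, so its high key is { rm_startblock = 107,
 * rm_owner = 131, rm_offset = 23 }. For a non-inode owner such as
 * XFS_RMAP_OWN_AG, or for a bmbt block mapping, only the startblock is
 * adjusted because the offset field carries no block offset there.
 */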

STATIC void
xfs_rmapbt_init_rec_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_rec	*rec)
{
	rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
	rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
	rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
	rec->rmap.rm_offset = cpu_to_be64(
			xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
}

STATIC void
xfs_rmapbt_init_ptr_from_cur(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr)
{
	struct xfs_agf		*agf = cur->bc_ag.agbp->b_addr;

	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));

	ptr->s = agf->agf_roots[cur->bc_btnum];
}

STATIC int64_t
xfs_rmapbt_key_diff(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*key)
{
	struct xfs_rmap_irec		*rec = &cur->bc_rec.r;
	const struct xfs_rmap_key	*kp = &key->rmap;
	__u64				x, y;
	int64_t				d;

	d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
	if (d)
		return d;

	x = be64_to_cpu(kp->rm_owner);
	y = rec->rm_owner;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
	y = rec->rm_offset;
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

STATIC int64_t
xfs_rmapbt_diff_two_keys(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	const struct xfs_rmap_key	*kp1 = &k1->rmap;
	const struct xfs_rmap_key	*kp2 = &k2->rmap;
	int64_t				d;
	__u64				x, y;

	d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
		     be32_to_cpu(kp2->rm_startblock);
	if (d)
		return d;

	x = be64_to_cpu(kp1->rm_owner);
	y = be64_to_cpu(kp2->rm_owner);
	if (x > y)
		return 1;
	else if (y > x)
		return -1;

	x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
	y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
	if (x > y)
		return 1;
	else if (y > x)
		return -1;
	return 0;
}

static xfs_failaddr_t
xfs_rmapbt_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_perag	*pag = bp->b_pag;
	xfs_failaddr_t		fa;
	unsigned int		level;

	/*
	 * magic number and level verification
	 *
	 * During growfs operations, we can't verify the exact level or owner as
	 * the perag is not fully initialised and hence not attached to the
	 * buffer. In this case, check against the maximum tree depth.
	 *
	 * Similarly, during log recovery we will have a perag structure
	 * attached, but the agf information will not yet have been initialised
	 * from the on disk AGF. Again, we can only check against maximum limits
	 * in this case.
	 */
	if (!xfs_verify_magic(bp, block->bb_magic))
		return __this_address;

	if (!xfs_has_rmapbt(mp))
		return __this_address;
	fa = xfs_btree_sblock_v5hdr_verify(bp);
	if (fa)
		return fa;

	level = be16_to_cpu(block->bb_level);
	if (pag && pag->pagf_init) {
		if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
			return __this_address;
	} else if (level >= mp->m_rmap_maxlevels)
		return __this_address;

	return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
}

static void
xfs_rmapbt_read_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	if (!xfs_btree_sblock_verify_crc(bp))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_rmapbt_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}

	if (bp->b_error)
		trace_xfs_btree_corrupt(bp, _RET_IP_);
}

static void
xfs_rmapbt_write_verify(
	struct xfs_buf	*bp)
{
	xfs_failaddr_t	fa;

	fa = xfs_rmapbt_verify(bp);
	if (fa) {
		trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}
	xfs_btree_sblock_calc_crc(bp);
}

const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
	.name			= "xfs_rmapbt",
	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
	.verify_read		= xfs_rmapbt_read_verify,
	.verify_write		= xfs_rmapbt_write_verify,
	.verify_struct		= xfs_rmapbt_verify,
};

STATIC int
xfs_rmapbt_keys_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_key	*k1,
	const union xfs_btree_key	*k2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(k1->rmap.rm_startblock);
	y = be32_to_cpu(k2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(k1->rmap.rm_owner);
	b = be64_to_cpu(k2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
	b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

STATIC int
xfs_rmapbt_recs_inorder(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*r1,
	const union xfs_btree_rec	*r2)
{
	uint32_t			x;
	uint32_t			y;
	uint64_t			a;
	uint64_t			b;

	x = be32_to_cpu(r1->rmap.rm_startblock);
	y = be32_to_cpu(r2->rmap.rm_startblock);
	if (x < y)
		return 1;
	else if (x > y)
		return 0;
	a = be64_to_cpu(r1->rmap.rm_owner);
	b = be64_to_cpu(r2->rmap.rm_owner);
	if (a < b)
		return 1;
	else if (a > b)
		return 0;
	a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
	b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
	if (a <= b)
		return 1;
	return 0;
}

static const struct xfs_btree_ops xfs_rmapbt_ops = {
	.rec_len		= sizeof(struct xfs_rmap_rec),
	.key_len		= 2 * sizeof(struct xfs_rmap_key),

	.dup_cursor		= xfs_rmapbt_dup_cursor,
	.set_root		= xfs_rmapbt_set_root,
	.alloc_block		= xfs_rmapbt_alloc_block,
	.free_block		= xfs_rmapbt_free_block,
	.get_minrecs		= xfs_rmapbt_get_minrecs,
	.get_maxrecs		= xfs_rmapbt_get_maxrecs,
	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
	.init_ptr_from_cur	= xfs_rmapbt_init_ptr_from_cur,
	.key_diff		= xfs_rmapbt_key_diff,
	.buf_ops		= &xfs_rmapbt_buf_ops,
	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
	.keys_inorder		= xfs_rmapbt_keys_inorder,
	.recs_inorder		= xfs_rmapbt_recs_inorder,
};

static struct xfs_btree_cur *
xfs_rmapbt_init_common(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	/* Overlapping btree; 2 keys per pointer. */
	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
			mp->m_rmap_maxlevels);
	cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
	cur->bc_ops = &xfs_rmapbt_ops;

	/* take a reference for the cursor */
	atomic_inc(&pag->pag_ref);
	cur->bc_ag.pag = pag;

	return cur;
}

/* Create a new reverse mapping btree cursor. */
struct xfs_btree_cur *
xfs_rmapbt_init_cursor(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xfs_btree_cur	*cur;

	cur = xfs_rmapbt_init_common(mp, tp, pag);
	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
	cur->bc_ag.agbp = agbp;
	return cur;
}

/* Create a new reverse mapping btree cursor with a fake root for staging. */
struct xfs_btree_cur *
xfs_rmapbt_stage_cursor(
	struct xfs_mount	*mp,
	struct xbtree_afakeroot	*afake,
	struct xfs_perag	*pag)
{
	struct xfs_btree_cur	*cur;

	cur = xfs_rmapbt_init_common(mp, NULL, pag);
	xfs_btree_stage_afakeroot(cur, afake);
	return cur;
}

/*
 * Install a new reverse mapping btree root. Caller is responsible for
 * invalidating and freeing the old btree blocks.
 */
void
xfs_rmapbt_commit_staged_btree(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	struct xfs_agf		*agf = agbp->b_addr;
	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
	agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
				    XFS_AGF_RMAP_BLOCKS);
	xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
}
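
/*
 * Sketch of the expected staging flow (illustrative only; the exact
 * sequence and error handling live in the repair code, not here):
 *
 *	cur = xfs_rmapbt_stage_cursor(mp, &afake, pag);
 *	error = xfs_btree_bload_compute_geometry(cur, &bload, nr_records);
 *	...allocate bload.nr_blocks blocks for the new tree...
 *	error = xfs_btree_bload(cur, &bload, priv);
 *	xfs_rmapbt_commit_staged_btree(cur, tp, agbp);
 *	xfs_btree_del_cursor(cur, error);
 *
 * Only after the commit does the AGF point at the new root; until then
 * the old tree remains intact, which is what makes staging safe.
 */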

/*
 * Calculate the maximum number of records that fit in an rmap btree block.
 */
int
xfs_rmapbt_maxrecs(
	int			blocklen,
	int			leaf)
{
	blocklen -= XFS_RMAP_BLOCK_LEN;

	if (leaf)
		return blocklen / sizeof(struct xfs_rmap_rec);
	return blocklen /
		(2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
}
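
/*
 * Worked example for the record counts above, assuming 4096-byte
 * filesystem blocks (XFS_RMAP_BLOCK_LEN is the 56-byte CRC-enabled
 * short-form block header): blocklen = 4096 - 56 = 4040, so a leaf
 * holds 4040 / 24 = 168 records and a node holds 4040 / (2 * 20 + 4)
 * = 91 key/pointer sets, the factor of two coming from storing both a
 * low and a high key per pointer in this overlapping btree.
 */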

/* Compute the maximum height of an rmap btree. */
void
xfs_rmapbt_compute_maxlevels(
	struct xfs_mount	*mp)
{
	/*
	 * On a non-reflink filesystem, the maximum number of rmap
	 * records is the number of blocks in the AG, hence the max
	 * rmapbt height is log_$maxrecs($agblocks). However, with
	 * reflink each AG block can have up to 2^32 (per the refcount
	 * record format) owners, which means that theoretically we
	 * could face up to 2^64 rmap records.
	 *
	 * That effectively means that the max rmapbt height must be
	 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
	 * blocks to feed the rmapbt long before the rmapbt reaches
	 * maximum height. The reflink code uses ag_resv_critical to
	 * disallow reflinking when less than 10% of the per-AG metadata
	 * block reservation remains free, since the fallback is a
	 * regular file copy.
	 */
	if (xfs_has_reflink(mp))
		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
	else
		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
}
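
/*
 * Illustrative numbers for the maxlevels computation above, assuming
 * the 4k-block geometry sketched earlier (m_rmap_mnr = { 84, 45 },
 * i.e. half of maxrecs): a non-reflink AG of 2^20 blocks needs
 * ceil(2^20 / 84) = 12484 leaf blocks, then 278, 7 and finally 1 block
 * per successive node level, i.e. 4 levels in total, well below
 * XFS_BTREE_MAXLEVELS.
 */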

/* Calculate the rmap btree size for some records. */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	return xfs_btree_calc_size(mp->m_rmap_mnr, len);
}

/*
 * Calculate the maximum rmap btree size.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp,
	xfs_agblock_t		agblocks)
{
	/* Bail out if we're uninitialized, which can happen in mkfs. */
	if (mp->m_rmap_mxr[0] == 0)
		return 0;

	return xfs_rmapbt_calc_size(mp, agblocks);
}

/*
 * Figure out how many blocks to reserve and how many are used by this btree.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agbp;
	struct xfs_agf		*agf;
	xfs_agblock_t		agblocks;
	xfs_extlen_t		tree_len;
	int			error;

	if (!xfs_has_rmapbt(mp))
		return 0;

	error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp);
	if (error)
		return error;

	agf = agbp->b_addr;
	agblocks = be32_to_cpu(agf->agf_length);
	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
	xfs_trans_brelse(tp, agbp);

	/*
	 * The log is permanently allocated, so the space it occupies will
	 * never be available for the kinds of things that would require btree
	 * expansion. We therefore can pretend the space isn't there.
	 */
	if (mp->m_sb.sb_logstart &&
	    XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == pag->pag_agno)
		agblocks -= mp->m_sb.sb_logblocks;

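	/*
	 * Worked example of the reservation below, with hypothetical
	 * numbers using the 4k-block geometry sketched earlier: for an
	 * AG of 1048576 blocks, agblocks / 100 asks for 10485 blocks,
	 * while xfs_rmapbt_max_size() sizes a tree holding one record
	 * per AG block at roughly 12770 blocks; max() takes the larger.
	 */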
	/* Reserve 1% of the AG or enough for 1 block per record. */
	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
	*used += tree_len;

	return error;
}