]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - libxfs/xfs_rmap_btree.c
libxfs: refactor manage_zones()
[thirdparty/xfsprogs-dev.git] / libxfs / xfs_rmap_btree.c
CommitLineData
37b3b4d6 1// SPDX-License-Identifier: GPL-2.0
b3a96b46
DW
2/*
3 * Copyright (c) 2014 Red Hat, Inc.
4 * All Rights Reserved.
b3a96b46
DW
5 */
6#include "libxfs_priv.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_log_format.h"
11#include "xfs_trans_resv.h"
12#include "xfs_bit.h"
13#include "xfs_sb.h"
14#include "xfs_mount.h"
15#include "xfs_defer.h"
16#include "xfs_inode.h"
17#include "xfs_trans.h"
18#include "xfs_alloc.h"
19#include "xfs_btree.h"
936ca687 20#include "xfs_rmap.h"
b3a96b46
DW
21#include "xfs_rmap_btree.h"
22#include "xfs_trace.h"
23#include "xfs_cksum.h"
02cc8b2a 24#include "xfs_ag_resv.h"
b3a96b46 25
936ca687
DW
26/*
27 * Reverse map btree.
28 *
29 * This is a per-ag tree used to track the owner(s) of a given extent. With
30 * reflink it is possible for there to be multiple owners, which is a departure
31 * from classic XFS. Owner records for data extents are inserted when the
32 * extent is mapped and removed when an extent is unmapped. Owner records for
33 * all other block types (i.e. metadata) are inserted when an extent is
34 * allocated and removed when an extent is freed. There can only be one owner
35 * of a metadata extent, usually an inode or some other metadata structure like
36 * an AG btree.
37 *
38 * The rmap btree is part of the free space management, so blocks for the tree
39 * are sourced from the agfl. Hence we need transaction reservation support for
40 * this tree so that the freelist is always large enough. This also impacts on
41 * the minimum space we need to leave free in the AG.
42 *
43 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
44 * but it is the only way to enforce unique keys when a block can be owned by
45 * multiple files at any offset. There's no need to order/search by extent
46 * size for online updating/management of the tree. It is intended that most
47 * reverse lookups will be to find the owner(s) of a particular block, or to
48 * try to recover tree and file data from corrupt primary metadata.
49 */
50
b3a96b46
DW
51static struct xfs_btree_cur *
52xfs_rmapbt_dup_cursor(
53 struct xfs_btree_cur *cur)
54{
55 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
56 cur->bc_private.a.agbp, cur->bc_private.a.agno);
57}
58
936ca687
DW
59STATIC void
60xfs_rmapbt_set_root(
61 struct xfs_btree_cur *cur,
62 union xfs_btree_ptr *ptr,
63 int inc)
64{
65 struct xfs_buf *agbp = cur->bc_private.a.agbp;
66 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
67 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
68 int btnum = cur->bc_btnum;
69 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
70
71 ASSERT(ptr->s != 0);
72
73 agf->agf_roots[btnum] = ptr->s;
74 be32_add_cpu(&agf->agf_levels[btnum], inc);
75 pag->pagf_levels[btnum] += inc;
76 xfs_perag_put(pag);
77
78 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
79}
80
81STATIC int
82xfs_rmapbt_alloc_block(
83 struct xfs_btree_cur *cur,
84 union xfs_btree_ptr *start,
85 union xfs_btree_ptr *new,
86 int *stat)
87{
8511b71a
DW
88 struct xfs_buf *agbp = cur->bc_private.a.agbp;
89 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
936ca687
DW
90 int error;
91 xfs_agblock_t bno;
92
936ca687
DW
93 /* Allocate the new block from the freelist. If we can't, give up. */
94 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
95 &bno, 1);
97b3ffd0 96 if (error)
936ca687 97 return error;
936ca687
DW
98
99 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
100 bno, 1);
101 if (bno == NULLAGBLOCK) {
936ca687
DW
102 *stat = 0;
103 return 0;
104 }
105
106 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
107 false);
108
109 xfs_trans_agbtree_delta(cur->bc_tp, 1);
110 new->s = cpu_to_be32(bno);
8511b71a
DW
111 be32_add_cpu(&agf->agf_rmap_blocks, 1);
112 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
936ca687 113
9760cac2
BF
114 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
115
936ca687
DW
116 *stat = 1;
117 return 0;
118}
119
120STATIC int
121xfs_rmapbt_free_block(
122 struct xfs_btree_cur *cur,
123 struct xfs_buf *bp)
124{
125 struct xfs_buf *agbp = cur->bc_private.a.agbp;
126 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
127 xfs_agblock_t bno;
128 int error;
129
130 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
131 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
132 bno, 1);
8511b71a
DW
133 be32_add_cpu(&agf->agf_rmap_blocks, -1);
134 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
936ca687
DW
135 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
136 if (error)
137 return error;
138
139 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
140 XFS_EXTENT_BUSY_SKIP_DISCARD);
141 xfs_trans_agbtree_delta(cur->bc_tp, -1);
142
9760cac2
BF
143 xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
144
936ca687
DW
145 return 0;
146}
147
148STATIC int
149xfs_rmapbt_get_minrecs(
150 struct xfs_btree_cur *cur,
151 int level)
152{
153 return cur->bc_mp->m_rmap_mnr[level != 0];
154}
155
156STATIC int
157xfs_rmapbt_get_maxrecs(
158 struct xfs_btree_cur *cur,
159 int level)
160{
161 return cur->bc_mp->m_rmap_mxr[level != 0];
162}
163
164STATIC void
165xfs_rmapbt_init_key_from_rec(
166 union xfs_btree_key *key,
167 union xfs_btree_rec *rec)
168{
169 key->rmap.rm_startblock = rec->rmap.rm_startblock;
170 key->rmap.rm_owner = rec->rmap.rm_owner;
171 key->rmap.rm_offset = rec->rmap.rm_offset;
172}
173
634b234e
DW
174/*
175 * The high key for a reverse mapping record can be computed by shifting
176 * the startblock and offset to the highest value that would still map
177 * to that record. In practice this means that we add blockcount-1 to
178 * the startblock for all records, and if the record is for a data/attr
179 * fork mapping, we add blockcount-1 to the offset too.
180 */
181STATIC void
182xfs_rmapbt_init_high_key_from_rec(
183 union xfs_btree_key *key,
184 union xfs_btree_rec *rec)
185{
4a492e72 186 uint64_t off;
634b234e
DW
187 int adj;
188
189 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
190
191 key->rmap.rm_startblock = rec->rmap.rm_startblock;
192 be32_add_cpu(&key->rmap.rm_startblock, adj);
193 key->rmap.rm_owner = rec->rmap.rm_owner;
194 key->rmap.rm_offset = rec->rmap.rm_offset;
195 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
196 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
197 return;
198 off = be64_to_cpu(key->rmap.rm_offset);
199 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
200 key->rmap.rm_offset = cpu_to_be64(off);
201}
202
936ca687
DW
203STATIC void
204xfs_rmapbt_init_rec_from_cur(
205 struct xfs_btree_cur *cur,
206 union xfs_btree_rec *rec)
207{
208 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
209 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
210 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
211 rec->rmap.rm_offset = cpu_to_be64(
212 xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
213}
214
215STATIC void
216xfs_rmapbt_init_ptr_from_cur(
217 struct xfs_btree_cur *cur,
218 union xfs_btree_ptr *ptr)
219{
220 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
221
222 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
936ca687
DW
223
224 ptr->s = agf->agf_roots[cur->bc_btnum];
225}
226
4a492e72 227STATIC int64_t
936ca687
DW
228xfs_rmapbt_key_diff(
229 struct xfs_btree_cur *cur,
230 union xfs_btree_key *key)
231{
232 struct xfs_rmap_irec *rec = &cur->bc_rec.r;
233 struct xfs_rmap_key *kp = &key->rmap;
234 __u64 x, y;
4a492e72 235 int64_t d;
936ca687 236
4a492e72 237 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
936ca687
DW
238 if (d)
239 return d;
240
241 x = be64_to_cpu(kp->rm_owner);
242 y = rec->rm_owner;
243 if (x > y)
244 return 1;
245 else if (y > x)
246 return -1;
247
248 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
249 y = rec->rm_offset;
250 if (x > y)
251 return 1;
252 else if (y > x)
253 return -1;
254 return 0;
255}
256
4a492e72 257STATIC int64_t
634b234e
DW
258xfs_rmapbt_diff_two_keys(
259 struct xfs_btree_cur *cur,
260 union xfs_btree_key *k1,
261 union xfs_btree_key *k2)
262{
263 struct xfs_rmap_key *kp1 = &k1->rmap;
264 struct xfs_rmap_key *kp2 = &k2->rmap;
4a492e72 265 int64_t d;
634b234e
DW
266 __u64 x, y;
267
4a492e72 268 d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
634b234e
DW
269 be32_to_cpu(kp2->rm_startblock);
270 if (d)
271 return d;
272
273 x = be64_to_cpu(kp1->rm_owner);
274 y = be64_to_cpu(kp2->rm_owner);
275 if (x > y)
276 return 1;
277 else if (y > x)
278 return -1;
279
280 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
281 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
282 if (x > y)
283 return 1;
284 else if (y > x)
285 return -1;
286 return 0;
287}
288
bc01119d 289static xfs_failaddr_t
b3a96b46
DW
290xfs_rmapbt_verify(
291 struct xfs_buf *bp)
292{
293 struct xfs_mount *mp = bp->b_target->bt_mount;
294 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
295 struct xfs_perag *pag = bp->b_pag;
bc01119d 296 xfs_failaddr_t fa;
b3a96b46
DW
297 unsigned int level;
298
299 /*
300 * magic number and level verification
301 *
302 * During growfs operations, we can't verify the exact level or owner as
303 * the perag is not fully initialised and hence not attached to the
304 * buffer. In this case, check against the maximum tree depth.
305 *
306 * Similarly, during log recovery we will have a perag structure
307 * attached, but the agf information will not yet have been initialised
308 * from the on disk AGF. Again, we can only check against maximum limits
309 * in this case.
310 */
311 if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
bc01119d 312 return __this_address;
b3a96b46
DW
313
314 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
bc01119d
DW
315 return __this_address;
316 fa = xfs_btree_sblock_v5hdr_verify(bp);
317 if (fa)
318 return fa;
b3a96b46
DW
319
320 level = be16_to_cpu(block->bb_level);
321 if (pag && pag->pagf_init) {
322 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
bc01119d 323 return __this_address;
b3a96b46 324 } else if (level >= mp->m_rmap_maxlevels)
bc01119d 325 return __this_address;
b3a96b46
DW
326
327 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
328}
329
330static void
331xfs_rmapbt_read_verify(
332 struct xfs_buf *bp)
333{
1e697959
DW
334 xfs_failaddr_t fa;
335
b3a96b46 336 if (!xfs_btree_sblock_verify_crc(bp))
1e697959
DW
337 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
338 else {
339 fa = xfs_rmapbt_verify(bp);
340 if (fa)
341 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
342 }
b3a96b46 343
7e6c95f1 344 if (bp->b_error)
b3a96b46 345 trace_xfs_btree_corrupt(bp, _RET_IP_);
b3a96b46
DW
346}
347
348static void
349xfs_rmapbt_write_verify(
350 struct xfs_buf *bp)
351{
1e697959
DW
352 xfs_failaddr_t fa;
353
354 fa = xfs_rmapbt_verify(bp);
355 if (fa) {
b3a96b46 356 trace_xfs_btree_corrupt(bp, _RET_IP_);
1e697959 357 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
b3a96b46
DW
358 return;
359 }
360 xfs_btree_sblock_calc_crc(bp);
361
362}
363
364const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
365 .name = "xfs_rmapbt",
366 .verify_read = xfs_rmapbt_read_verify,
367 .verify_write = xfs_rmapbt_write_verify,
95d9582b 368 .verify_struct = xfs_rmapbt_verify,
b3a96b46
DW
369};
370
936ca687
DW
371STATIC int
372xfs_rmapbt_keys_inorder(
373 struct xfs_btree_cur *cur,
374 union xfs_btree_key *k1,
375 union xfs_btree_key *k2)
376{
4a492e72
DW
377 uint32_t x;
378 uint32_t y;
379 uint64_t a;
380 uint64_t b;
936ca687
DW
381
382 x = be32_to_cpu(k1->rmap.rm_startblock);
383 y = be32_to_cpu(k2->rmap.rm_startblock);
384 if (x < y)
385 return 1;
386 else if (x > y)
387 return 0;
388 a = be64_to_cpu(k1->rmap.rm_owner);
389 b = be64_to_cpu(k2->rmap.rm_owner);
390 if (a < b)
391 return 1;
392 else if (a > b)
393 return 0;
394 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
395 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
396 if (a <= b)
397 return 1;
398 return 0;
399}
400
401STATIC int
402xfs_rmapbt_recs_inorder(
403 struct xfs_btree_cur *cur,
404 union xfs_btree_rec *r1,
405 union xfs_btree_rec *r2)
406{
4a492e72
DW
407 uint32_t x;
408 uint32_t y;
409 uint64_t a;
410 uint64_t b;
936ca687
DW
411
412 x = be32_to_cpu(r1->rmap.rm_startblock);
413 y = be32_to_cpu(r2->rmap.rm_startblock);
414 if (x < y)
415 return 1;
416 else if (x > y)
417 return 0;
418 a = be64_to_cpu(r1->rmap.rm_owner);
419 b = be64_to_cpu(r2->rmap.rm_owner);
420 if (a < b)
421 return 1;
422 else if (a > b)
423 return 0;
424 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
425 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
426 if (a <= b)
427 return 1;
428 return 0;
429}
936ca687 430
b3a96b46
DW
431static const struct xfs_btree_ops xfs_rmapbt_ops = {
432 .rec_len = sizeof(struct xfs_rmap_rec),
433 .key_len = 2 * sizeof(struct xfs_rmap_key),
434
435 .dup_cursor = xfs_rmapbt_dup_cursor,
936ca687
DW
436 .set_root = xfs_rmapbt_set_root,
437 .alloc_block = xfs_rmapbt_alloc_block,
438 .free_block = xfs_rmapbt_free_block,
439 .get_minrecs = xfs_rmapbt_get_minrecs,
440 .get_maxrecs = xfs_rmapbt_get_maxrecs,
441 .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
634b234e 442 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
936ca687
DW
443 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
444 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
445 .key_diff = xfs_rmapbt_key_diff,
b3a96b46 446 .buf_ops = &xfs_rmapbt_buf_ops,
634b234e 447 .diff_two_keys = xfs_rmapbt_diff_two_keys,
936ca687
DW
448 .keys_inorder = xfs_rmapbt_keys_inorder,
449 .recs_inorder = xfs_rmapbt_recs_inorder,
b3a96b46
DW
450};
451
452/*
453 * Allocate a new allocation btree cursor.
454 */
455struct xfs_btree_cur *
456xfs_rmapbt_init_cursor(
457 struct xfs_mount *mp,
458 struct xfs_trans *tp,
459 struct xfs_buf *agbp,
460 xfs_agnumber_t agno)
461{
462 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
463 struct xfs_btree_cur *cur;
464
465 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
466 cur->bc_tp = tp;
467 cur->bc_mp = mp;
634b234e 468 /* Overlapping btree; 2 keys per pointer. */
b3a96b46 469 cur->bc_btnum = XFS_BTNUM_RMAP;
634b234e 470 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
b3a96b46
DW
471 cur->bc_blocklog = mp->m_sb.sb_blocklog;
472 cur->bc_ops = &xfs_rmapbt_ops;
473 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
5d8acc46 474 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
b3a96b46
DW
475
476 cur->bc_private.a.agbp = agbp;
477 cur->bc_private.a.agno = agno;
478
479 return cur;
480}
481
482/*
483 * Calculate number of records in an rmap btree block.
484 */
485int
486xfs_rmapbt_maxrecs(
b3a96b46
DW
487 int blocklen,
488 int leaf)
489{
490 blocklen -= XFS_RMAP_BLOCK_LEN;
491
492 if (leaf)
493 return blocklen / sizeof(struct xfs_rmap_rec);
494 return blocklen /
634b234e 495 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
b3a96b46
DW
496}
497
498/* Compute the maximum height of an rmap btree. */
499void
500xfs_rmapbt_compute_maxlevels(
501 struct xfs_mount *mp)
502{
88ce0792
DW
503 /*
504 * On a non-reflink filesystem, the maximum number of rmap
505 * records is the number of blocks in the AG, hence the max
506 * rmapbt height is log_$maxrecs($agblocks). However, with
507 * reflink each AG block can have up to 2^32 (per the refcount
508 * record format) owners, which means that theoretically we
509 * could face up to 2^64 rmap records.
510 *
511 * That effectively means that the max rmapbt height must be
512 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
513 * blocks to feed the rmapbt long before the rmapbt reaches
514 * maximum height. The reflink code uses ag_resv_critical to
515 * disallow reflinking when less than 10% of the per-AG metadata
516 * block reservation since the fallback is a regular file copy.
517 */
518 if (xfs_sb_version_hasreflink(&mp->m_sb))
519 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
520 else
1421de38 521 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
88ce0792 522 mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
b3a96b46 523}
02cc8b2a
DW
524
525/* Calculate the refcount btree size for some records. */
526xfs_extlen_t
527xfs_rmapbt_calc_size(
528 struct xfs_mount *mp,
529 unsigned long long len)
530{
1421de38 531 return xfs_btree_calc_size(mp->m_rmap_mnr, len);
02cc8b2a
DW
532}
533
534/*
535 * Calculate the maximum refcount btree size.
536 */
537xfs_extlen_t
538xfs_rmapbt_max_size(
f21c57ed
DW
539 struct xfs_mount *mp,
540 xfs_agblock_t agblocks)
02cc8b2a
DW
541{
542 /* Bail out if we're uninitialized, which can happen in mkfs. */
543 if (mp->m_rmap_mxr[0] == 0)
544 return 0;
545
f21c57ed 546 return xfs_rmapbt_calc_size(mp, agblocks);
02cc8b2a
DW
547}
548
549/*
550 * Figure out how many blocks to reserve and how many are used by this btree.
551 */
552int
553xfs_rmapbt_calc_reserves(
554 struct xfs_mount *mp,
0d802327 555 struct xfs_trans *tp,
02cc8b2a
DW
556 xfs_agnumber_t agno,
557 xfs_extlen_t *ask,
558 xfs_extlen_t *used)
559{
560 struct xfs_buf *agbp;
561 struct xfs_agf *agf;
f21c57ed 562 xfs_agblock_t agblocks;
02cc8b2a
DW
563 xfs_extlen_t tree_len;
564 int error;
565
566 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
567 return 0;
568
0d802327 569 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
02cc8b2a
DW
570 if (error)
571 return error;
572
573 agf = XFS_BUF_TO_AGF(agbp);
f21c57ed 574 agblocks = be32_to_cpu(agf->agf_length);
02cc8b2a 575 tree_len = be32_to_cpu(agf->agf_rmap_blocks);
0d802327 576 xfs_trans_brelse(tp, agbp);
02cc8b2a 577
f21c57ed
DW
578 /* Reserve 1% of the AG or enough for 1 block per record. */
579 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
02cc8b2a
DW
580 *used += tree_len;
581
582 return error;
583}