]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - libxfs/xfs_rmap_btree.c
libxfs: refactor manage_zones()
[thirdparty/xfsprogs-dev.git] / libxfs / xfs_rmap_btree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2014 Red Hat, Inc.
4 * All Rights Reserved.
5 */
6 #include "libxfs_priv.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_inode.h"
17 #include "xfs_trans.h"
18 #include "xfs_alloc.h"
19 #include "xfs_btree.h"
20 #include "xfs_rmap.h"
21 #include "xfs_rmap_btree.h"
22 #include "xfs_trace.h"
23 #include "xfs_cksum.h"
24 #include "xfs_ag_resv.h"
25
26 /*
27 * Reverse map btree.
28 *
29 * This is a per-ag tree used to track the owner(s) of a given extent. With
30 * reflink it is possible for there to be multiple owners, which is a departure
31 * from classic XFS. Owner records for data extents are inserted when the
32 * extent is mapped and removed when an extent is unmapped. Owner records for
33 * all other block types (i.e. metadata) are inserted when an extent is
34 * allocated and removed when an extent is freed. There can only be one owner
35 * of a metadata extent, usually an inode or some other metadata structure like
36 * an AG btree.
37 *
38 * The rmap btree is part of the free space management, so blocks for the tree
39 * are sourced from the agfl. Hence we need transaction reservation support for
40 * this tree so that the freelist is always large enough. This also impacts on
41 * the minimum space we need to leave free in the AG.
42 *
43 * The tree is ordered by [ag block, owner, offset]. This is a large key size,
44 * but it is the only way to enforce unique keys when a block can be owned by
45 * multiple files at any offset. There's no need to order/search by extent
46 * size for online updating/management of the tree. It is intended that most
47 * reverse lookups will be to find the owner(s) of a particular block, or to
48 * try to recover tree and file data from corrupt primary metadata.
49 */
50
51 static struct xfs_btree_cur *
52 xfs_rmapbt_dup_cursor(
53 struct xfs_btree_cur *cur)
54 {
55 return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
56 cur->bc_private.a.agbp, cur->bc_private.a.agno);
57 }
58
59 STATIC void
60 xfs_rmapbt_set_root(
61 struct xfs_btree_cur *cur,
62 union xfs_btree_ptr *ptr,
63 int inc)
64 {
65 struct xfs_buf *agbp = cur->bc_private.a.agbp;
66 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
67 xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
68 int btnum = cur->bc_btnum;
69 struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno);
70
71 ASSERT(ptr->s != 0);
72
73 agf->agf_roots[btnum] = ptr->s;
74 be32_add_cpu(&agf->agf_levels[btnum], inc);
75 pag->pagf_levels[btnum] += inc;
76 xfs_perag_put(pag);
77
78 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
79 }
80
81 STATIC int
82 xfs_rmapbt_alloc_block(
83 struct xfs_btree_cur *cur,
84 union xfs_btree_ptr *start,
85 union xfs_btree_ptr *new,
86 int *stat)
87 {
88 struct xfs_buf *agbp = cur->bc_private.a.agbp;
89 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
90 int error;
91 xfs_agblock_t bno;
92
93 /* Allocate the new block from the freelist. If we can't, give up. */
94 error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
95 &bno, 1);
96 if (error)
97 return error;
98
99 trace_xfs_rmapbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
100 bno, 1);
101 if (bno == NULLAGBLOCK) {
102 *stat = 0;
103 return 0;
104 }
105
106 xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1,
107 false);
108
109 xfs_trans_agbtree_delta(cur->bc_tp, 1);
110 new->s = cpu_to_be32(bno);
111 be32_add_cpu(&agf->agf_rmap_blocks, 1);
112 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
113
114 xfs_ag_resv_rmapbt_alloc(cur->bc_mp, cur->bc_private.a.agno);
115
116 *stat = 1;
117 return 0;
118 }
119
120 STATIC int
121 xfs_rmapbt_free_block(
122 struct xfs_btree_cur *cur,
123 struct xfs_buf *bp)
124 {
125 struct xfs_buf *agbp = cur->bc_private.a.agbp;
126 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
127 xfs_agblock_t bno;
128 int error;
129
130 bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
131 trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
132 bno, 1);
133 be32_add_cpu(&agf->agf_rmap_blocks, -1);
134 xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
135 error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
136 if (error)
137 return error;
138
139 xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
140 XFS_EXTENT_BUSY_SKIP_DISCARD);
141 xfs_trans_agbtree_delta(cur->bc_tp, -1);
142
143 xfs_ag_resv_rmapbt_free(cur->bc_mp, cur->bc_private.a.agno);
144
145 return 0;
146 }
147
148 STATIC int
149 xfs_rmapbt_get_minrecs(
150 struct xfs_btree_cur *cur,
151 int level)
152 {
153 return cur->bc_mp->m_rmap_mnr[level != 0];
154 }
155
156 STATIC int
157 xfs_rmapbt_get_maxrecs(
158 struct xfs_btree_cur *cur,
159 int level)
160 {
161 return cur->bc_mp->m_rmap_mxr[level != 0];
162 }
163
164 STATIC void
165 xfs_rmapbt_init_key_from_rec(
166 union xfs_btree_key *key,
167 union xfs_btree_rec *rec)
168 {
169 key->rmap.rm_startblock = rec->rmap.rm_startblock;
170 key->rmap.rm_owner = rec->rmap.rm_owner;
171 key->rmap.rm_offset = rec->rmap.rm_offset;
172 }
173
174 /*
175 * The high key for a reverse mapping record can be computed by shifting
176 * the startblock and offset to the highest value that would still map
177 * to that record. In practice this means that we add blockcount-1 to
178 * the startblock for all records, and if the record is for a data/attr
179 * fork mapping, we add blockcount-1 to the offset too.
180 */
181 STATIC void
182 xfs_rmapbt_init_high_key_from_rec(
183 union xfs_btree_key *key,
184 union xfs_btree_rec *rec)
185 {
186 uint64_t off;
187 int adj;
188
189 adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
190
191 key->rmap.rm_startblock = rec->rmap.rm_startblock;
192 be32_add_cpu(&key->rmap.rm_startblock, adj);
193 key->rmap.rm_owner = rec->rmap.rm_owner;
194 key->rmap.rm_offset = rec->rmap.rm_offset;
195 if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
196 XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
197 return;
198 off = be64_to_cpu(key->rmap.rm_offset);
199 off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
200 key->rmap.rm_offset = cpu_to_be64(off);
201 }
202
203 STATIC void
204 xfs_rmapbt_init_rec_from_cur(
205 struct xfs_btree_cur *cur,
206 union xfs_btree_rec *rec)
207 {
208 rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
209 rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
210 rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
211 rec->rmap.rm_offset = cpu_to_be64(
212 xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
213 }
214
215 STATIC void
216 xfs_rmapbt_init_ptr_from_cur(
217 struct xfs_btree_cur *cur,
218 union xfs_btree_ptr *ptr)
219 {
220 struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
221
222 ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
223
224 ptr->s = agf->agf_roots[cur->bc_btnum];
225 }
226
227 STATIC int64_t
228 xfs_rmapbt_key_diff(
229 struct xfs_btree_cur *cur,
230 union xfs_btree_key *key)
231 {
232 struct xfs_rmap_irec *rec = &cur->bc_rec.r;
233 struct xfs_rmap_key *kp = &key->rmap;
234 __u64 x, y;
235 int64_t d;
236
237 d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
238 if (d)
239 return d;
240
241 x = be64_to_cpu(kp->rm_owner);
242 y = rec->rm_owner;
243 if (x > y)
244 return 1;
245 else if (y > x)
246 return -1;
247
248 x = XFS_RMAP_OFF(be64_to_cpu(kp->rm_offset));
249 y = rec->rm_offset;
250 if (x > y)
251 return 1;
252 else if (y > x)
253 return -1;
254 return 0;
255 }
256
257 STATIC int64_t
258 xfs_rmapbt_diff_two_keys(
259 struct xfs_btree_cur *cur,
260 union xfs_btree_key *k1,
261 union xfs_btree_key *k2)
262 {
263 struct xfs_rmap_key *kp1 = &k1->rmap;
264 struct xfs_rmap_key *kp2 = &k2->rmap;
265 int64_t d;
266 __u64 x, y;
267
268 d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
269 be32_to_cpu(kp2->rm_startblock);
270 if (d)
271 return d;
272
273 x = be64_to_cpu(kp1->rm_owner);
274 y = be64_to_cpu(kp2->rm_owner);
275 if (x > y)
276 return 1;
277 else if (y > x)
278 return -1;
279
280 x = XFS_RMAP_OFF(be64_to_cpu(kp1->rm_offset));
281 y = XFS_RMAP_OFF(be64_to_cpu(kp2->rm_offset));
282 if (x > y)
283 return 1;
284 else if (y > x)
285 return -1;
286 return 0;
287 }
288
289 static xfs_failaddr_t
290 xfs_rmapbt_verify(
291 struct xfs_buf *bp)
292 {
293 struct xfs_mount *mp = bp->b_target->bt_mount;
294 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
295 struct xfs_perag *pag = bp->b_pag;
296 xfs_failaddr_t fa;
297 unsigned int level;
298
299 /*
300 * magic number and level verification
301 *
302 * During growfs operations, we can't verify the exact level or owner as
303 * the perag is not fully initialised and hence not attached to the
304 * buffer. In this case, check against the maximum tree depth.
305 *
306 * Similarly, during log recovery we will have a perag structure
307 * attached, but the agf information will not yet have been initialised
308 * from the on disk AGF. Again, we can only check against maximum limits
309 * in this case.
310 */
311 if (block->bb_magic != cpu_to_be32(XFS_RMAP_CRC_MAGIC))
312 return __this_address;
313
314 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
315 return __this_address;
316 fa = xfs_btree_sblock_v5hdr_verify(bp);
317 if (fa)
318 return fa;
319
320 level = be16_to_cpu(block->bb_level);
321 if (pag && pag->pagf_init) {
322 if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
323 return __this_address;
324 } else if (level >= mp->m_rmap_maxlevels)
325 return __this_address;
326
327 return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
328 }
329
330 static void
331 xfs_rmapbt_read_verify(
332 struct xfs_buf *bp)
333 {
334 xfs_failaddr_t fa;
335
336 if (!xfs_btree_sblock_verify_crc(bp))
337 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
338 else {
339 fa = xfs_rmapbt_verify(bp);
340 if (fa)
341 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
342 }
343
344 if (bp->b_error)
345 trace_xfs_btree_corrupt(bp, _RET_IP_);
346 }
347
348 static void
349 xfs_rmapbt_write_verify(
350 struct xfs_buf *bp)
351 {
352 xfs_failaddr_t fa;
353
354 fa = xfs_rmapbt_verify(bp);
355 if (fa) {
356 trace_xfs_btree_corrupt(bp, _RET_IP_);
357 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
358 return;
359 }
360 xfs_btree_sblock_calc_crc(bp);
361
362 }
363
364 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
365 .name = "xfs_rmapbt",
366 .verify_read = xfs_rmapbt_read_verify,
367 .verify_write = xfs_rmapbt_write_verify,
368 .verify_struct = xfs_rmapbt_verify,
369 };
370
371 STATIC int
372 xfs_rmapbt_keys_inorder(
373 struct xfs_btree_cur *cur,
374 union xfs_btree_key *k1,
375 union xfs_btree_key *k2)
376 {
377 uint32_t x;
378 uint32_t y;
379 uint64_t a;
380 uint64_t b;
381
382 x = be32_to_cpu(k1->rmap.rm_startblock);
383 y = be32_to_cpu(k2->rmap.rm_startblock);
384 if (x < y)
385 return 1;
386 else if (x > y)
387 return 0;
388 a = be64_to_cpu(k1->rmap.rm_owner);
389 b = be64_to_cpu(k2->rmap.rm_owner);
390 if (a < b)
391 return 1;
392 else if (a > b)
393 return 0;
394 a = XFS_RMAP_OFF(be64_to_cpu(k1->rmap.rm_offset));
395 b = XFS_RMAP_OFF(be64_to_cpu(k2->rmap.rm_offset));
396 if (a <= b)
397 return 1;
398 return 0;
399 }
400
401 STATIC int
402 xfs_rmapbt_recs_inorder(
403 struct xfs_btree_cur *cur,
404 union xfs_btree_rec *r1,
405 union xfs_btree_rec *r2)
406 {
407 uint32_t x;
408 uint32_t y;
409 uint64_t a;
410 uint64_t b;
411
412 x = be32_to_cpu(r1->rmap.rm_startblock);
413 y = be32_to_cpu(r2->rmap.rm_startblock);
414 if (x < y)
415 return 1;
416 else if (x > y)
417 return 0;
418 a = be64_to_cpu(r1->rmap.rm_owner);
419 b = be64_to_cpu(r2->rmap.rm_owner);
420 if (a < b)
421 return 1;
422 else if (a > b)
423 return 0;
424 a = XFS_RMAP_OFF(be64_to_cpu(r1->rmap.rm_offset));
425 b = XFS_RMAP_OFF(be64_to_cpu(r2->rmap.rm_offset));
426 if (a <= b)
427 return 1;
428 return 0;
429 }
430
431 static const struct xfs_btree_ops xfs_rmapbt_ops = {
432 .rec_len = sizeof(struct xfs_rmap_rec),
433 .key_len = 2 * sizeof(struct xfs_rmap_key),
434
435 .dup_cursor = xfs_rmapbt_dup_cursor,
436 .set_root = xfs_rmapbt_set_root,
437 .alloc_block = xfs_rmapbt_alloc_block,
438 .free_block = xfs_rmapbt_free_block,
439 .get_minrecs = xfs_rmapbt_get_minrecs,
440 .get_maxrecs = xfs_rmapbt_get_maxrecs,
441 .init_key_from_rec = xfs_rmapbt_init_key_from_rec,
442 .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
443 .init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
444 .init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
445 .key_diff = xfs_rmapbt_key_diff,
446 .buf_ops = &xfs_rmapbt_buf_ops,
447 .diff_two_keys = xfs_rmapbt_diff_two_keys,
448 .keys_inorder = xfs_rmapbt_keys_inorder,
449 .recs_inorder = xfs_rmapbt_recs_inorder,
450 };
451
452 /*
453 * Allocate a new allocation btree cursor.
454 */
455 struct xfs_btree_cur *
456 xfs_rmapbt_init_cursor(
457 struct xfs_mount *mp,
458 struct xfs_trans *tp,
459 struct xfs_buf *agbp,
460 xfs_agnumber_t agno)
461 {
462 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
463 struct xfs_btree_cur *cur;
464
465 cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
466 cur->bc_tp = tp;
467 cur->bc_mp = mp;
468 /* Overlapping btree; 2 keys per pointer. */
469 cur->bc_btnum = XFS_BTNUM_RMAP;
470 cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
471 cur->bc_blocklog = mp->m_sb.sb_blocklog;
472 cur->bc_ops = &xfs_rmapbt_ops;
473 cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
474 cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
475
476 cur->bc_private.a.agbp = agbp;
477 cur->bc_private.a.agno = agno;
478
479 return cur;
480 }
481
482 /*
483 * Calculate number of records in an rmap btree block.
484 */
485 int
486 xfs_rmapbt_maxrecs(
487 int blocklen,
488 int leaf)
489 {
490 blocklen -= XFS_RMAP_BLOCK_LEN;
491
492 if (leaf)
493 return blocklen / sizeof(struct xfs_rmap_rec);
494 return blocklen /
495 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
496 }
497
498 /* Compute the maximum height of an rmap btree. */
499 void
500 xfs_rmapbt_compute_maxlevels(
501 struct xfs_mount *mp)
502 {
503 /*
504 * On a non-reflink filesystem, the maximum number of rmap
505 * records is the number of blocks in the AG, hence the max
506 * rmapbt height is log_$maxrecs($agblocks). However, with
507 * reflink each AG block can have up to 2^32 (per the refcount
508 * record format) owners, which means that theoretically we
509 * could face up to 2^64 rmap records.
510 *
511 * That effectively means that the max rmapbt height must be
512 * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG
513 * blocks to feed the rmapbt long before the rmapbt reaches
514 * maximum height. The reflink code uses ag_resv_critical to
515 * disallow reflinking when less than 10% of the per-AG metadata
516 * block reservation since the fallback is a regular file copy.
517 */
518 if (xfs_sb_version_hasreflink(&mp->m_sb))
519 mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
520 else
521 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
522 mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
523 }
524
525 /* Calculate the refcount btree size for some records. */
526 xfs_extlen_t
527 xfs_rmapbt_calc_size(
528 struct xfs_mount *mp,
529 unsigned long long len)
530 {
531 return xfs_btree_calc_size(mp->m_rmap_mnr, len);
532 }
533
534 /*
535 * Calculate the maximum refcount btree size.
536 */
537 xfs_extlen_t
538 xfs_rmapbt_max_size(
539 struct xfs_mount *mp,
540 xfs_agblock_t agblocks)
541 {
542 /* Bail out if we're uninitialized, which can happen in mkfs. */
543 if (mp->m_rmap_mxr[0] == 0)
544 return 0;
545
546 return xfs_rmapbt_calc_size(mp, agblocks);
547 }
548
549 /*
550 * Figure out how many blocks to reserve and how many are used by this btree.
551 */
552 int
553 xfs_rmapbt_calc_reserves(
554 struct xfs_mount *mp,
555 struct xfs_trans *tp,
556 xfs_agnumber_t agno,
557 xfs_extlen_t *ask,
558 xfs_extlen_t *used)
559 {
560 struct xfs_buf *agbp;
561 struct xfs_agf *agf;
562 xfs_agblock_t agblocks;
563 xfs_extlen_t tree_len;
564 int error;
565
566 if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
567 return 0;
568
569 error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp);
570 if (error)
571 return error;
572
573 agf = XFS_BUF_TO_AGF(agbp);
574 agblocks = be32_to_cpu(agf->agf_length);
575 tree_len = be32_to_cpu(agf->agf_rmap_blocks);
576 xfs_trans_brelse(tp, agbp);
577
578 /* Reserve 1% of the AG or enough for 1 block per record. */
579 *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
580 *used += tree_len;
581
582 return error;
583 }