// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "libxfs_priv.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_ag_resv.h"
extern kmem_zone_t	*xfs_bmap_free_item_zone;

struct workqueue_struct *xfs_alloc_wq;
#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2
STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_sb_version_hascrc(&mp->m_sb))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}
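
/*
 * Illustrative example (not from the original source): with 512-byte
 * sectors on a v5 (CRC) filesystem, and assuming the AGFL header
 * (struct xfs_agfl) occupies 36 bytes, the free list holds
 * (512 - 36) / sizeof(xfs_agblock_t) = 476 / 4 = 119 slots; a v4
 * filesystem with the same sector size would get 512 / 4 = 128.
 */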
unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}
xfs_agblock_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		return xfs_refc_block(mp) + 1;
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}
/*
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks. However, these restrictions may result in no actual space
 * allocated for a delayed extent, for example, a data block in a certain AG is
 * allocated but there is no additional block for the additional bmap btree
 * block due to a split of the bmap btree of the file. The result of this may
 * lead to an infinite loop when the file gets flushed to disk and all delayed
 * extents need to be actually allocated. To get around this, we explicitly set
 * aside a few blocks which will not be reserved in delayed allocation.
 *
 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
 * potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
}
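
/*
 * Illustrative example (not from the original source): on a filesystem
 * with sb_agcount = 4, and assuming XFS_ALLOC_AGFL_RESERVE = 4, this sets
 * aside 4 * (4 + 4) = 32 filesystem blocks that delayed allocation can
 * never reserve.
 */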
/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 * - the AG superblock, AGF, AGI and AGFL
 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *   the AGI free inode and rmap btree root blocks.
 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
 * - the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOC_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_sb_version_hasfinobt(&mp->m_sb))
		blocks++;		/* finobt root block */
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		blocks++;		/* rmap root block */
	if (xfs_sb_version_hasreflink(&mp->m_sb))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}
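
/*
 * Illustrative example (not from the original source): with 4k filesystem
 * blocks and 512-byte sectors, the four sector-sized AG headers fit in one
 * filesystem block, so with the finobt, rmapbt and reflink features all
 * enabled this withholds 1 + 4 + 3 + 3 = 11 blocks from sb_agblocks
 * (again assuming XFS_ALLOC_AGFL_RESERVE = 4).
 */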
/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
STATIC int				/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}
/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int				/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}
/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}
static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && cur->bc_ag.abt.active;
}
/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}
/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_alloc_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		*bno,	/* output: starting block of extent */
	xfs_extlen_t		*len,	/* output: length of extent */
	int			*stat)	/* output: success/failure */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_agnumber_t		agno = cur->bc_ag.agno;
	union xfs_btree_rec	*rec;
	int			error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || !(*stat))
		return error;

	*bno = be32_to_cpu(rec->alloc.ar_startblock);
	*len = be32_to_cpu(rec->alloc.ar_blockcount);

	if (*len == 0)
		goto out_bad_rec;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbno(mp, agno, *bno))
		goto out_bad_rec;
	if (*bno > *bno + *len)
		goto out_bad_rec;
	if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
		goto out_bad_rec;

	return error;

out_bad_rec:
	xfs_warn(mp,
		"%s Freespace BTree record corruption in AG %d detected!",
		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
	xfs_warn(mp,
		"start block 0x%x block count 0x%x", *bno, *len);
	return -EFSCORRUPTED;
}
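
/*
 * Illustrative note (not from the original source): the "*bno > *bno + *len"
 * test catches 32-bit wraparound.  For example, a corrupt record with
 * bno = 0xfffffff0 and len = 0x20 wraps past the end of the address space
 * and must be rejected as -EFSCORRUPTED rather than treated as free space.
 */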
/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
STATIC bool
xfs_alloc_compute_aligned(
	xfs_alloc_arg_t	*args,		/* allocation argument structure */
	xfs_agblock_t	foundbno,	/* starting block in found extent */
	xfs_extlen_t	foundlen,	/* length in found extent */
	xfs_agblock_t	*resbno,	/* result block number */
	xfs_extlen_t	*reslen,	/* result length */
	unsigned	*busy_gen)
{
	xfs_agblock_t	bno = foundbno;
	xfs_extlen_t	len = foundlen;
	xfs_extlen_t	diff;
	bool		busy;

	/* Trim busy sections out of found extent */
	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

	/*
	 * If we have a largish extent that happens to start before min_agbno,
	 * see if we can shift it into range...
	 */
	if (bno < args->min_agbno && bno + len > args->min_agbno) {
		diff = args->min_agbno - bno;
		if (len > diff) {
			bno += diff;
			len -= diff;
		}
	}

	if (args->alignment > 1 && len >= args->minlen) {
		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);

		diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}

	return busy;
}
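
/*
 * Illustrative example (not from the original source): with
 * args->alignment = 4 and a trimmed extent [bno = 10, len = 20],
 * aligned_bno = roundup(10, 4) = 12 and diff = 2, so the aligned result
 * is [*resbno = 12, *reslen = 18].
 */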
/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t		/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1 = 0;	/* length with newbno1 */
	xfs_extlen_t	newlen2 = 0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before desired block. The second case is there to allow
	 * for contiguous allocation from the remaining free space if the file
	 * grows in the short term.
	 */
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}
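
/*
 * Illustrative example (not from the original source): wantbno = 100,
 * wantlen = 10, alignment = 1, free extent [freebno = 90, freelen = 40].
 * The first two cases do not apply, but freeend = 130 >= wantend = 110,
 * so newbno1 = wantbno = 100 and the returned diff is 0 - a perfectly
 * local allocation carved out of the middle of the free extent.
 */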
/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}
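
/*
 * Illustrative example (not from the original source): args->prod = 4,
 * args->mod = 1, args->len = 11.  Then k = 11 % 4 = 3 > mod, so
 * rlen = 11 - (3 - 1) = 9, and 9 % 4 == 1 == args->mod as required.
 */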
/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	xfs_btree_cur_t	*cnt_cur,	/* cursor for by-size btree */
	xfs_btree_cur_t	*bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1=0;	/* first new free length */
	xfs_extlen_t	nflen2=0;	/* second new free length */
	struct xfs_mount *mp;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs))
			return -EFSCORRUPTED;
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}
	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1))
		return -EFSCORRUPTED;
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	return 0;
}
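
/*
 * Illustrative example (not from the original source): allocating
 * rbno = 110, rlen = 20 out of the free extent fbno = 100, flen = 50 hits
 * the fourth case above and leaves two new free records,
 * [nfbno1 = 100, nflen1 = 10] and [nfbno2 = 130, nflen2 = 20].
 */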
static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}
static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	xfs_failaddr_t		fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};
/*
 * Read in the allocation group free block array.
 */
int					/* error */
xfs_alloc_read_agfl(
	xfs_mount_t	*mp,		/* mount point structure */
	xfs_trans_t	*tp,		/* transaction pointer */
	xfs_agnumber_t	agno,		/* allocation group number */
	xfs_buf_t	**bpp)		/* buffer for the ag free block array */
{
	xfs_buf_t	*bp;		/* return value */
	int		error;

	ASSERT(agno != NULLAGNUMBER);
	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}
STATIC int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	xfs_trans_agblocks_delta(tp, len);
	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}
/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur		*cnt;	/* btree cursors */
	struct xfs_btree_cur		*bnolt;
	struct xfs_btree_cur		*bnogt;
	xfs_extlen_t			cur_len;/* current search length */
	xfs_agblock_t			rec_bno;/* extent startblock */
	xfs_extlen_t			rec_len;/* extent length */
	xfs_agblock_t			bno;	/* alloc bno */
	xfs_extlen_t			len;	/* alloc len */
	xfs_extlen_t			diff;	/* diff from search bno */
	unsigned int			busy_gen;/* busy state */
	bool				busy;
};
/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
	 */
	if (!acur->cnt)
		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->agno, XFS_BTNUM_CNT);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->agno, XFS_BTNUM_BNO);
	if (!acur->bnogt)
		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->agno, XFS_BTNUM_BNO);
	return i == 1 ? 0 : -ENOSPC;
}
static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}
/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1))
		return -EFSCORRUPTED;

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation mode.
	 */
	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation. This only deactivates
	 * the current cursor, but this is just an optimization to terminate a
	 * cntbt search that otherwise runs to the edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_ag.abt.active = false;
	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
				  *new);
	return 0;
}
/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
static int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}
/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
	}
	if (error)
		return error;

	/*
	 * Increment the search key until we find at least one allocation
	 * candidate or if the extent we found was larger. Otherwise, double the
	 * search key to optimize the search. Efficiency is more important here
	 * than absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;

	return error;
}
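
/*
 * Illustrative note (not from the original source): while no candidate has
 * been found the search key creeps up one block at a time; once a candidate
 * exists it doubles on each pass (e.g. 8 -> 16 -> 32), so the cntbt scan
 * converges in a logarithmic number of lookups instead of visiting every
 * size class.
 */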
/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist if
 * there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;

	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}
/*
 * Allocate a variable extent in the allocation group agno.
 * Type and bno are used to determine where in the allocation group the
 * extent will start.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent(
	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
{
	int		error = 0;

	ASSERT(args->minlen > 0);
	ASSERT(args->maxlen > 0);
	ASSERT(args->minlen <= args->maxlen);
	ASSERT(args->mod < args->prod);
	ASSERT(args->alignment > 0);

	/*
	 * Branch to correct routine based on the type.
	 */
	args->wasfromfl = 0;
	switch (args->type) {
	case XFS_ALLOCTYPE_THIS_AG:
		error = xfs_alloc_ag_vextent_size(args);
		break;
	case XFS_ALLOCTYPE_NEAR_BNO:
		error = xfs_alloc_ag_vextent_near(args);
		break;
	case XFS_ALLOCTYPE_THIS_BNO:
		error = xfs_alloc_ag_vextent_exact(args);
		break;
	default:
		ASSERT(0);
		/* NOTREACHED */
	}

	if (error || args->agbno == NULLAGBLOCK)
		return error;

	ASSERT(args->len >= args->minlen);
	ASSERT(args->len <= args->maxlen);
	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
	ASSERT(args->agbno % args->alignment == 0);

	/* if not file data, insert new block into the reverse map btree */
	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
		error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
				       args->agbno, args->len, &args->oinfo);
		if (error)
			return error;
	}

	if (!args->wasfromfl) {
		error = xfs_alloc_update_counters(args->tp, args->agbp,
						  -((long)(args->len)));
		if (error)
			return error;

		ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
					       args->agbno, args->len));
	}

	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);

	XFS_STATS_INC(args->mp, xs_allocx);
	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
	return error;
}
/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	xfs_btree_cur_t	*bno_cur;/* by block-number btree cursor */
	xfs_btree_cur_t	*cnt_cur;/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->agno, XFS_BTNUM_BNO);

	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno, it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
		args->agno, XFS_BTNUM_CNT);
	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}
/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,	  /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;

	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_ag.abt.active = false;

		if (count > 0)
			count--;
	}

	return 0;
}
/*
 * Search the by-bno and by-size btrees in parallel in search of an extent with
 * ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);
	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand, the
	 * bnobt is able to satisfy most smaller allocation requests much more
	 * quickly than the cntbt. The cntbt search can sift through fragmented
	 * free space and sets of free extents for larger allocation requests
	 * more quickly than the bnobt. Since the locality hint is just a hint
	 * and we don't want to scan the entire bnobt for perfect locality, the
	 * cntbt search essentially bounds the bnobt search such that we can
	 * find good enough locality at reasonable performance in most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);

		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_ag.abt.active = true;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}
/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (prandom_u32() & 1)
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks.  If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_ptrs[0] = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1))
				return -EFSCORRUPTED;
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work.  We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}
/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
				&len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * First algorithm.
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree.  If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
				&allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			trace_xfs_alloc_near_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag,
					      acur.busy_gen);
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}
/*
 * Allocate a variable extent anywhere in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_size(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_agf	*agf = args->agbp->b_addr;
	xfs_btree_cur_t	*bno_cur;	/* cursor for bno btree */
	xfs_btree_cur_t	*cnt_cur;	/* cursor for cnt btree */
	int		error;		/* error result */
	xfs_agblock_t	fbno;		/* start of found freespace */
	xfs_extlen_t	flen;		/* length of found freespace */
	int		i;		/* temp status variable */
	xfs_agblock_t	rbno;		/* returned block number */
	xfs_extlen_t	rlen;		/* length of returned extent */
	bool		busy;
	unsigned	busy_gen;

restart:
	/*
	 * Allocate and initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
		args->agno, XFS_BTNUM_CNT);
	bno_cur = NULL;
	busy = false;

	/*
	 * Look for an entry >= maxlen+alignment-1 blocks.
	 */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
			args->maxlen + args->alignment - 1, &i)))
		goto error0;

	/*
	 * If none then we have to settle for a smaller extent. In the case that
	 * there are no large extents, this will return the last entry in the
	 * tree unless the tree is empty. In the case that there are only busy
	 * large extents, this will return the largest small extent unless there
	 * are no smaller extents available.
	 */
	if (!i) {
		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
						   &fbno, &flen, &i);
		if (error)
			goto error0;
		if (i == 0 || flen == 0) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_noentry(args);
			return 0;
		}
		ASSERT(i == 1);
		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
				&rlen, &busy_gen);
	} else {
		/*
		 * Search for a non-busy extent that is large enough.
		 */
		for (;;) {
			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}

			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);

			if (rlen >= args->maxlen)
				break;

			error = xfs_btree_increment(cnt_cur, 0, &i);
			if (error)
				goto error0;
			if (i == 0) {
				/*
				 * Our only valid extents must have been busy.
				 * Make it unbusy by forcing the log out and
				 * retrying.
				 */
				xfs_btree_del_cursor(cnt_cur,
						     XFS_BTREE_NOERROR);
				trace_xfs_alloc_size_busy(args);
				xfs_extent_busy_flush(args->mp,
							args->pag, busy_gen);
				goto restart;
			}
		}
	}

	/*
	 * In the first case above, we got the last entry in the
	 * by-size btree.  Now we check to see if the space hits maxlen
	 * once aligned; if not, we search left for something better.
	 * This can't happen in the second case above.
	 */
	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
	if (XFS_IS_CORRUPT(args->mp,
			   rlen != 0 &&
			   (rlen > flen ||
			    rbno + rlen > fbno + flen))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (rlen < args->maxlen) {
		xfs_agblock_t	bestfbno;
		xfs_extlen_t	bestflen;
		xfs_agblock_t	bestrbno;
		xfs_extlen_t	bestrlen;

		bestrlen = rlen;
		bestrbno = rbno;
		bestflen = flen;
		bestfbno = fbno;
		for (;;) {
			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
				goto error0;
			if (i == 0)
				break;
			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (flen < bestrlen)
				break;
			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);
			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
			if (XFS_IS_CORRUPT(args->mp,
					   rlen != 0 &&
					   (rlen > flen ||
					    rbno + rlen > fbno + flen))) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (rlen > bestrlen) {
				bestrlen = rlen;
				bestrbno = rbno;
				bestflen = flen;
				bestfbno = fbno;
				if (rlen == args->maxlen)
					break;
			}
		}
		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
				&i)))
			goto error0;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		rlen = bestrlen;
		rbno = bestrbno;
		flen = bestflen;
		fbno = bestfbno;
	}
	args->wasfromfl = 0;
	/*
	 * Fix up the length.
	 */
	args->len = rlen;
	if (rlen < args->minlen) {
		if (busy) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
			goto restart;
		}
		goto out_nominleft;
	}
	xfs_alloc_fix_len(args);

	rlen = args->len;
	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Allocate and initialize a cursor for the by-block tree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
		args->agno, XFS_BTNUM_BNO);
	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
			rbno, rlen, XFSA_FIXUP_CNT_OK)))
		goto error0;
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	cnt_cur = bno_cur = NULL;
	args->len = rlen;
	args->agbno = rbno;
	if (XFS_IS_CORRUPT(args->mp,
			   args->agbno + args->len >
			   be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	trace_xfs_alloc_size_done(args);
	return 0;

error0:
	trace_xfs_alloc_size_error(args);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	return error;

out_nominleft:
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	trace_xfs_alloc_size_nominleft(args);
	args->agbno = NULLAGBLOCK;
	return 0;
}
/*
 * Free the extent starting at agno/bno for length.
 */
STATIC int
xfs_free_ag_extent(
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agnumber_t			agno,
	xfs_agblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xfs_mount		*mp;
	struct xfs_btree_cur		*bno_cur;
	struct xfs_btree_cur		*cnt_cur;
	xfs_agblock_t			gtbno;	/* start of right neighbor */
	xfs_extlen_t			gtlen;	/* length of right neighbor */
	xfs_agblock_t			ltbno;	/* start of left neighbor */
	xfs_extlen_t			ltlen;	/* length of left neighbor */
	xfs_agblock_t			nbno;	/* new starting block of freesp */
	xfs_extlen_t			nlen;	/* new length of freespace */
	int				haveleft; /* have a left neighbor */
	int				haveright; /* have a right neighbor */
	int				i;
	int				error;

	bno_cur = cnt_cur = NULL;
	mp = tp->t_mountp;

	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
		error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
		if (error)
			goto error0;
	}

	/*
	 * Allocate and initialize a cursor for the by-block btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
	/*
	 * Look for a neighboring block on the left (lower block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
		goto error0;
	if (haveleft) {
		/*
		 * There is a block to our left.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (ltbno + ltlen < bno)
			haveleft = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Look for a neighboring block on the right (higher block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
		goto error0;
	if (haveright) {
		/*
		 * There is a block to our right.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (bno + len < gtbno)
			haveright = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Now allocate and initialize a cursor for the by-size tree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
	/*
	 * Have both left and right contiguous neighbors.
	 * Merge all three into a single free block.
	 */
	if (haveleft && haveright) {
		/*
		 * Delete the old by-size entry on the left.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Delete the old by-size entry on the right.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Delete the old by-block entry for the right block.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Move the by-block cursor back to the left neighbor.
		 */
		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
#ifdef DEBUG
		/*
		 * Check that this is the right record: delete didn't
		 * mangle the cursor.
		 */
		{
			xfs_agblock_t	xxbno;
			xfs_extlen_t	xxlen;

			if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(mp,
					   i != 1 ||
					   xxbno != ltbno ||
					   xxlen != ltlen)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
#endif
		/*
		 * Update remaining by-block entry to the new, joined block.
		 */
		nbno = ltbno;
		nlen = len + ltlen + gtlen;
		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
			goto error0;
	}
	/*
	 * Have only a left contiguous neighbor.
	 * Merge it together with the new freespace.
	 */
	else if (haveleft) {
		/*
		 * Delete the old by-size entry on the left.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Back up the by-block cursor to the left neighbor, and
		 * update its length.
		 */
		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		nbno = ltbno;
		nlen = len + ltlen;
		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
			goto error0;
	}
	/*
	 * Have only a right contiguous neighbor.
	 * Merge it together with the new freespace.
	 */
	else if (haveright) {
		/*
		 * Delete the old by-size entry on the right.
		 */
		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		if ((error = xfs_btree_delete(cnt_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * Update the starting block and length of the right
		 * neighbor in the by-block tree.
		 */
		nbno = bno;
		nlen = len + gtlen;
		if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
			goto error0;
	}
	/*
	 * No contiguous neighbors.
	 * Insert the new freespace into the by-block tree.
	 */
	else {
		nbno = bno;
		nlen = len;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
	}
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	bno_cur = NULL;
	/*
	 * In all cases we need to insert the new freespace in the by-size tree.
	 */
	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 0)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	if ((error = xfs_btree_insert(cnt_cur, &i)))
		goto error0;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	cnt_cur = NULL;

	/*
	 * Update the freespace totals in the ag and superblock.
	 */
	error = xfs_alloc_update_counters(tp, agbp, len);
	xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
	if (error)
		goto error0;

	XFS_STATS_INC(mp, xs_freex);
	XFS_STATS_ADD(mp, xs_freeb, len);

	trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);

	return 0;

error0:
	trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	return error;
}
/*
 * Visible (exported) allocation/free functions.
 * Some of these are used just by xfs_alloc_btree.c and this file.
 */

/*
 * Compute and fill in value of m_ag_maxlevels.
 */
void
xfs_alloc_compute_maxlevels(
	xfs_mount_t	*mp)	/* file system mount structure */
{
	mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
			(mp->m_sb.sb_agblocks + 1) / 2);
}
/*
 * Find the length of the longest extent in an AG.  The 'need' parameter
 * specifies how much space we're going to need for the AGFL and the
 * 'reserved' parameter tells us how many blocks in this AG are reserved for
 * other callers.
 */
xfs_extlen_t
xfs_alloc_longest_free_extent(
	struct xfs_perag	*pag,
	xfs_extlen_t		need,
	xfs_extlen_t		reserved)
{
	xfs_extlen_t		delta = 0;

	/*
	 * If the AGFL needs a recharge, we'll have to subtract that from the
	 * longest extent.
	 */
	if (need > pag->pagf_flcount)
		delta = need - pag->pagf_flcount;

	/*
	 * If we cannot maintain others' reservations with space from the
	 * not-longest freesp extents, we'll have to subtract /that/ from
	 * the longest extent too.
	 */
	if (pag->pagf_freeblks - pag->pagf_longest < reserved)
		delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);

	/*
	 * If the longest extent is long enough to satisfy all the
	 * reservations and AGFL rules in place, we can return this extent.
	 */
	if (pag->pagf_longest > delta)
		return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
				pag->pagf_longest - delta);

	/* Otherwise, let the caller try for 1 block if there's space. */
	return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
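
/*
 * Illustrative example (not from the original source): with
 * pagf_longest = 100, pagf_freeblks = 150, need = 8, pagf_flcount = 3 and
 * reserved = 60, delta = (8 - 3) + (60 - (150 - 100)) = 15, so the usable
 * longest extent reported is min(m_ag_max_usable, 100 - 15) = 85 blocks.
 */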
/*
 * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
 * return the largest possible minimum length.
 */
unsigned int
xfs_alloc_min_freelist(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag)
{
	/* AG btrees have at least 1 level. */
	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
	unsigned int		min_free;

	ASSERT(mp->m_ag_maxlevels > 0);

	/* space needed by-bno freespace btree */
	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
			 mp->m_ag_maxlevels);
	/* space needed by-size freespace btree */
	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
			  mp->m_ag_maxlevels);
	/* space needed reverse mapping used space btree */
	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
				  mp->m_rmap_maxlevels);

	return min_free;
}
/*
 * Check if the operation we are fixing up the freelist for should go ahead or
 * not. If we are freeing blocks, we always allow it, otherwise the allocation
 * is dependent on whether the size and shape of free space available will
 * permit the requested allocation to take place.
 */
static bool
xfs_alloc_space_available(
	struct xfs_alloc_arg	*args,
	xfs_extlen_t		min_free,
	int			flags)
{
	struct xfs_perag	*pag = args->pag;
	xfs_extlen_t		alloc_len, longest;
	xfs_extlen_t		reservation; /* blocks that are still reserved */
	int			available;
	xfs_extlen_t		agflcount;

	if (flags & XFS_ALLOC_FLAG_FREEING)
		return true;

	reservation = xfs_ag_resv_needed(pag, args->resv);

	/* do we have enough contiguous free space for the allocation? */
	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
	if (longest < alloc_len)
		return false;

	/*
	 * Do we have enough free space remaining for the allocation? Don't
	 * account extra agfl blocks because we are about to defer freeing
	 * them, making them unavailable until the current transaction commits.
	 */
	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
	available = (int)(pag->pagf_freeblks + agflcount -
			  reservation - min_free - args->minleft);
	if (available < (int)max(args->total, alloc_len))
		return false;

	/*
	 * Clamp maxlen to the amount of free space available for the actual
	 * extent allocation.
	 */
	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
		args->maxlen = available;
		ASSERT(args->maxlen > 0);
		ASSERT(args->maxlen >= args->minlen);
	}

	return true;
}
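
/*
 * Illustrative example (not from the original source): pagf_freeblks = 100,
 * pagf_flcount = 6, min_free = 8, reservation = 20, args->minleft = 0 and
 * args->total = 40.  Then agflcount = min(6, 8) = 6 and
 * available = 100 + 6 - 20 - 8 - 0 = 78 >= 40, so the allocation may
 * proceed; maxlen is then clamped to 78 if it was larger.
 */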
int
xfs_free_agfl_block(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	struct xfs_buf		*agbp,
	struct xfs_owner_info	*oinfo)
{
	int			error;
	struct xfs_buf		*bp;

	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
				   XFS_AG_RESV_AGFL);
	if (error)
		return error;

	error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
			XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
			tp->t_mountp->m_bsize, 0, &bp);
	if (error)
		return error;
	xfs_trans_binval(tp, bp);

	return 0;
}
/*
 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
 * is to detect an agfl header padding mismatch between current and early v5
 * kernels. This problem manifests as a 1-slot size difference between the
 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
 * may also catch variants of agfl count corruption unrelated to padding. Either
 * way, we'll reset the agfl and warn the user.
 *
 * Return true if a reset is required before the agfl can be used, false
 * otherwise.
 */
static bool
xfs_agfl_needs_reset(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf)
{
	uint32_t		f = be32_to_cpu(agf->agf_flfirst);
	uint32_t		l = be32_to_cpu(agf->agf_fllast);
	uint32_t		c = be32_to_cpu(agf->agf_flcount);
	int			agfl_size = xfs_agfl_size(mp);
	int			active;

	/* no agfl header on v4 supers */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return false;

	/*
	 * The agf read verifier catches severe corruption of these fields.
	 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
	 * the verifier allows it.
	 */
	if (f >= agfl_size || l >= agfl_size)
		return true;
	if (c > agfl_size)
		return true;

	/*
	 * Check consistency between the on-disk count and the active range. An
	 * agfl padding mismatch manifests as an inconsistent flcount.
	 */
	if (c && l >= f)
		active = l - f + 1;
	else if (c)
		active = agfl_size - f + l + 1;
	else
		active = 0;

	return active != c;
}
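/*
 * Worked example (illustrative): on a 512-byte-sector V5 filesystem,
 * xfs_agfl_size() is (512 - sizeof(struct xfs_agfl)) / 4 = 119 slots.
 * Suppose the active range has wrapped with f = 100 and l = 5:
 * active = 119 - 100 + 5 + 1 = 25.  A packed/unpacked header mismatch
 * shifts the on-disk count by one slot, so c = 24 would force a reset here.
 */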
/*
 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
 * agfl content cannot be trusted. Warn the user that a repair is required to
 * recover leaked blocks.
 *
 * The purpose of this mechanism is to handle filesystems affected by the agfl
 * header padding mismatch problem. A reset keeps the filesystem online with a
 * relatively minor free space accounting inconsistency rather than suffer the
 * inevitable crash from use of an invalid agfl block.
 */
static void
xfs_agfl_reset(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agf		*agf = agbp->b_addr;

	ASSERT(pag->pagf_agflreset);
	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);

	xfs_warn(mp,
	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
	       "Please unmount and run xfs_repair.",
	         pag->pag_agno, pag->pagf_flcount);

	agf->agf_flfirst = 0;
	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
	agf->agf_flcount = 0;
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
				    XFS_AGF_FLCOUNT);

	pag->pagf_flcount = 0;
	pag->pagf_agflreset = false;
}
/*
 * Defer an AGFL block free. This is effectively equivalent to
 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
 *
 * Deferring AGFL frees helps prevent log reservation overruns due to too many
 * allocation operations in a transaction. AGFL frees are prone to this problem
 * because for one they are always freed one at a time. Further, an immediate
 * AGFL block free can cause a btree join and require another block free before
 * the real allocation can proceed. Deferring the free disconnects freeing up
 * the AGFL slot from freeing the block.
 */
STATIC void
xfs_defer_agfl_block(
	struct xfs_trans		*tp,
	xfs_agnumber_t			agno,
	xfs_fsblock_t			agbno,
	struct xfs_owner_info		*oinfo)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_extent_free_item	*new;		/* new element */

	ASSERT(xfs_bmap_free_item_zone != NULL);
	ASSERT(oinfo != NULL);

	new = kmem_cache_alloc(xfs_bmap_free_item_zone,
			       GFP_KERNEL | __GFP_NOFAIL);
	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
	new->xefi_blockcount = 1;
	new->xefi_oinfo = *oinfo;
	new->xefi_skip_discard = false;

	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);

	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
/*
 * Decide whether to use this allocation group for this allocation.
 * If so, fix up the btree freelist's size.
 */
int			/* error */
xfs_alloc_fix_freelist(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	int			flags)	/* XFS_ALLOC_FLAG_... */
{
	struct xfs_mount	*mp = args->mp;
	struct xfs_perag	*pag = args->pag;
	struct xfs_trans	*tp = args->tp;
	struct xfs_buf		*agbp = NULL;
	struct xfs_buf		*agflbp = NULL;
	struct xfs_alloc_arg	targs;	/* local allocation arguments */
	xfs_agblock_t		bno;	/* freelist block */
	xfs_extlen_t		need;	/* total blocks needed in freelist */
	int			error = 0;

	/* deferred ops (AGFL block frees) require permanent transactions */
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

	if (!pag->pagf_init) {
		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
		if (error) {
			/* Couldn't lock the AGF so skip this AG. */
			if (error == -EAGAIN)
				error = 0;
			goto out_no_agbp;
		}
	}

	/*
	 * If this is a metadata preferred pag and we are user data then try
	 * somewhere else if we are not being asked to try harder at this
	 * point
	 */
	if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
	    (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
		ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
		goto out_agbp_relse;
	}

	need = xfs_alloc_min_freelist(mp, pag);
	if (!xfs_alloc_space_available(args, need, flags |
			XFS_ALLOC_FLAG_CHECK))
		goto out_agbp_relse;

	/*
	 * Get the a.g. freespace buffer.
	 * Can fail if we're not blocking on locks, and it's held.
	 */
	if (!agbp) {
		error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
		if (error) {
			/* Couldn't lock the AGF so skip this AG. */
			if (error == -EAGAIN)
				error = 0;
			goto out_no_agbp;
		}
	}

	/* reset a padding mismatched agfl before final free space check */
	if (pag->pagf_agflreset)
		xfs_agfl_reset(tp, agbp, pag);

	/* If there isn't enough total space or single-extent, reject it. */
	need = xfs_alloc_min_freelist(mp, pag);
	if (!xfs_alloc_space_available(args, need, flags))
		goto out_agbp_relse;

	/*
	 * Make the freelist shorter if it's too long.
	 *
	 * Note that from this point onwards, we will always release the agf and
	 * agfl buffers on error. This handles the case where we error out and
	 * the buffers are clean or may not have been joined to the transaction
	 * and hence need to be released manually. If they have been joined to
	 * the transaction, then xfs_trans_brelse() will handle them
	 * appropriately based on the recursion count and dirty state of the
	 * buffer.
	 *
	 * XXX (dgc): When we have lots of free space, does this buy us
	 * anything other than extra overhead when we need to put more blocks
	 * back on the free list? Maybe we should only do this when space is
	 * getting low or the AGFL is more than half full?
	 *
	 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
	 * big; the NORMAP flag prevents AGFL expand/shrink operations from
	 * updating the rmapbt. Both flags are used in xfs_repair while we're
	 * rebuilding the rmapbt, and neither are used by the kernel. They're
	 * both required to ensure that rmaps are correctly recorded for the
	 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
	 * repair/rmap.c in xfsprogs for details.
	 */
	memset(&targs, 0, sizeof(targs));
	/* struct copy below */
	if (flags & XFS_ALLOC_FLAG_NORMAP)
		targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
	else
		targs.oinfo = XFS_RMAP_OINFO_AG;
	while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
		error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
		if (error)
			goto out_agbp_relse;

		/* defer agfl frees */
		xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
	}

	targs.tp = tp;
	targs.mp = mp;
	targs.agbp = agbp;
	targs.agno = args->agno;
	targs.alignment = targs.minlen = targs.prod = 1;
	targs.type = XFS_ALLOCTYPE_THIS_AG;
	targs.pag = pag;
	error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
	if (error)
		goto out_agbp_relse;

	/* Make the freelist longer if it's too short. */
	while (pag->pagf_flcount < need) {
		targs.agbno = 0;
		targs.maxlen = need - pag->pagf_flcount;
		targs.resv = XFS_AG_RESV_AGFL;

		/* Allocate as many blocks as possible at once. */
		error = xfs_alloc_ag_vextent(&targs);
		if (error)
			goto out_agflbp_relse;

		/*
		 * Stop if we run out.  Won't happen if callers are obeying
		 * the restrictions correctly.  Can happen for free calls
		 * on a completely full ag.
		 */
		if (targs.agbno == NULLAGBLOCK) {
			if (flags & XFS_ALLOC_FLAG_FREEING)
				break;
			goto out_agflbp_relse;
		}
		/*
		 * Put each allocated block on the list.
		 */
		for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
			error = xfs_alloc_put_freelist(tp, agbp,
							agflbp, bno, 0);
			if (error)
				goto out_agflbp_relse;
		}
	}
	xfs_trans_brelse(tp, agflbp);
	args->agbp = agbp;
	return 0;

out_agflbp_relse:
	xfs_trans_brelse(tp, agflbp);
out_agbp_relse:
	if (agbp)
		xfs_trans_brelse(tp, agbp);
out_no_agbp:
	args->agbp = NULL;
	return error;
}
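/*
 * Call pattern sketch (illustrative): callers such as xfs_alloc_vextent()
 * below are expected to set up args->pag for args->agno, call
 * xfs_alloc_fix_freelist(), and only proceed with the actual allocation if
 * args->agbp comes back non-NULL:
 *
 *	args->pag = xfs_perag_get(mp, args->agno);
 *	error = xfs_alloc_fix_freelist(args, flags);
 *	if (!error && args->agbp)
 *		error = xfs_alloc_ag_vextent(args);
 */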
/*
 * Get a block from the freelist.
 * Returns with the buffer for the block gotten.
 */
int				/* error */
xfs_alloc_get_freelist(
	xfs_trans_t	*tp,	/* transaction pointer */
	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
	int		btreeblk) /* destination is a AGF btree */
{
	struct xfs_agf	*agf = agbp->b_addr;
	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
	xfs_agblock_t	bno;	/* block number returned */
	__be32		*agfl_bno;
	int		error;
	int		logflags;
	xfs_mount_t	*mp = tp->t_mountp;
	xfs_perag_t	*pag;	/* per allocation group data */

	/*
	 * Freelist is empty, give up.
	 */
	if (!agf->agf_flcount) {
		*bnop = NULLAGBLOCK;
		return 0;
	}
	/*
	 * Read the array of free blocks.
	 */
	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
				    &agflbp);
	if (error)
		return error;

	/*
	 * Get the block number and update the data structures.
	 */
	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
	be32_add_cpu(&agf->agf_flfirst, 1);
	xfs_trans_brelse(tp, agflbp);
	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
		agf->agf_flfirst = 0;

	pag = agbp->b_pag;
	ASSERT(!pag->pagf_agflreset);
	be32_add_cpu(&agf->agf_flcount, -1);
	xfs_trans_agflist_delta(tp, -1);
	pag->pagf_flcount--;

	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
	if (btreeblk) {
		be32_add_cpu(&agf->agf_btreeblks, 1);
		pag->pagf_btreeblks++;
		logflags |= XFS_AGF_BTREEBLKS;
	}

	xfs_alloc_log_agf(tp, agbp, logflags);
	*bnop = bno;

	return 0;
}
/*
 * Log the given fields from the agf structure.
 */
void
xfs_alloc_log_agf(
	xfs_trans_t	*tp,	/* transaction pointer */
	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
{
	int	first;		/* first byte offset */
	int	last;		/* last byte offset */
	static const short	offsets[] = {
		offsetof(xfs_agf_t, agf_magicnum),
		offsetof(xfs_agf_t, agf_versionnum),
		offsetof(xfs_agf_t, agf_seqno),
		offsetof(xfs_agf_t, agf_length),
		offsetof(xfs_agf_t, agf_roots[0]),
		offsetof(xfs_agf_t, agf_levels[0]),
		offsetof(xfs_agf_t, agf_flfirst),
		offsetof(xfs_agf_t, agf_fllast),
		offsetof(xfs_agf_t, agf_flcount),
		offsetof(xfs_agf_t, agf_freeblks),
		offsetof(xfs_agf_t, agf_longest),
		offsetof(xfs_agf_t, agf_btreeblks),
		offsetof(xfs_agf_t, agf_uuid),
		offsetof(xfs_agf_t, agf_rmap_blocks),
		offsetof(xfs_agf_t, agf_refcount_blocks),
		offsetof(xfs_agf_t, agf_refcount_root),
		offsetof(xfs_agf_t, agf_refcount_level),
		/* needed so that we don't log the whole rest of the structure: */
		offsetof(xfs_agf_t, agf_spare64),
		sizeof(xfs_agf_t)
	};

	trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);

	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);

	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
/*
 * Interface for inode allocation to force the pag data to be initialized.
 */
int					/* error */
xfs_alloc_pagf_init(
	xfs_mount_t		*mp,	/* file system mount structure */
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_agnumber_t		agno,	/* allocation group number */
	int			flags)	/* XFS_ALLOC_FLAGS_... */
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
	if (!error)
		xfs_trans_brelse(tp, bp);
	return error;
}
/*
 * Put the block on the freelist for the allocation group.
 */
int					/* error */
xfs_alloc_put_freelist(
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
	xfs_agblock_t		bno,	/* block being freed */
	int			btreeblk) /* block came from a AGF btree */
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agf		*agf = agbp->b_addr;
	__be32			*blockp;/* pointer to array entry */
	int			error;
	int			logflags;
	xfs_perag_t		*pag;	/* per allocation group data */
	__be32			*agfl_bno;
	int			startoff;

	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
			be32_to_cpu(agf->agf_seqno), &agflbp)))
		return error;
	be32_add_cpu(&agf->agf_fllast, 1);
	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
		agf->agf_fllast = 0;

	pag = agbp->b_pag;
	ASSERT(!pag->pagf_agflreset);
	be32_add_cpu(&agf->agf_flcount, 1);
	xfs_trans_agflist_delta(tp, 1);
	pag->pagf_flcount++;

	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
	if (btreeblk) {
		be32_add_cpu(&agf->agf_btreeblks, -1);
		pag->pagf_btreeblks--;
		logflags |= XFS_AGF_BTREEBLKS;
	}

	xfs_alloc_log_agf(tp, agbp, logflags);

	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
	*blockp = cpu_to_be32(bno);
	startoff = (char *)blockp - (char *)agflbp->b_addr;

	xfs_alloc_log_agf(tp, agbp, logflags);

	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
	xfs_trans_log_buf(tp, agflbp, startoff,
			  startoff + sizeof(xfs_agblock_t) - 1);
	return 0;
}
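/*
 * Note (illustrative): the AGFL is a circular array, which is why only the
 * single slot written above needs logging.  For example, if agf_fllast sat
 * on the last slot (xfs_agfl_size(mp) - 1) on entry, the increment wraps it
 * back to 0 and the freed block lands in slot 0 of the array.
 */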
static xfs_failaddr_t
xfs_agf_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agf		*agf = bp->b_addr;

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
			return __this_address;
	}

	if (!xfs_verify_magic(bp, agf->agf_magicnum))
		return __this_address;

	if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
	      be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
	      be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
	      be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
	      be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
		return __this_address;

	if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
		return __this_address;

	if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
	    be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
		return __this_address;

	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
		return __this_address;

	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
		return __this_address;

	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
	    be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
		return __this_address;

	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
		return __this_address;

	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
	    be32_to_cpu(agf->agf_refcount_blocks) >
	    be32_to_cpu(agf->agf_length))
		return __this_address;

	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
	     be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
		return __this_address;

	return NULL;
}
static void
xfs_agf_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_sb_version_hascrc(&mp->m_sb) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agf_verify(bp);
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
static void
xfs_agf_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agf		*agf = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agf_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	if (bip)
		agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
	.name = "xfs_agf",
	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
	.verify_read = xfs_agf_read_verify,
	.verify_write = xfs_agf_write_verify,
	.verify_struct = xfs_agf_verify,
};
/*
 * Read in the allocation group header (free/alloc section).
 */
int					/* error */
xfs_read_agf(
	struct xfs_mount	*mp,	/* mount point structure */
	struct xfs_trans	*tp,	/* transaction pointer */
	xfs_agnumber_t		agno,	/* allocation group number */
	int			flags,	/* XFS_BUF_ */
	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
{
	int			error;

	trace_xfs_read_agf(mp, agno);

	ASSERT(agno != NULLAGNUMBER);
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
	if (error)
		return error;

	ASSERT(!(*bpp)->b_error);
	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
	return 0;
}
/*
 * Read in the allocation group header (free/alloc section).
 */
int					/* error */
xfs_alloc_read_agf(
	struct xfs_mount	*mp,	/* mount point structure */
	struct xfs_trans	*tp,	/* transaction pointer */
	xfs_agnumber_t		agno,	/* allocation group number */
	int			flags,	/* XFS_ALLOC_FLAG_... */
	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
{
	struct xfs_agf		*agf;	/* ag freelist header */
	struct xfs_perag	*pag;	/* per allocation group data */
	int			error;

	trace_xfs_alloc_read_agf(mp, agno);

	/* We don't support trylock when freeing. */
	ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
			(XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
	ASSERT(agno != NULLAGNUMBER);
	error = xfs_read_agf(mp, tp, agno,
			(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			bpp);
	if (error)
		return error;
	ASSERT(!(*bpp)->b_error);

	agf = (*bpp)->b_addr;
	pag = (*bpp)->b_pag;
	if (!pag->pagf_init) {
		pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
		pag->pagf_levels[XFS_BTNUM_BNOi] =
			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
		pag->pagf_levels[XFS_BTNUM_CNTi] =
			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
		pag->pagf_levels[XFS_BTNUM_RMAPi] =
			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
		pag->pagf_init = 1;
		pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
	}
#ifdef DEBUG
	else if (!XFS_FORCED_SHUTDOWN(mp)) {
		ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
	}
#endif
	return 0;
}
/*
 * Allocate an extent (variable-size).
 * Depending on the allocation type, we either look in a single allocation
 * group or loop over the allocation groups to find the result.
 */
int				/* error */
xfs_alloc_vextent(
	struct xfs_alloc_arg	*args)	/* allocation argument structure */
{
	xfs_agblock_t		agsize;	/* allocation group size */
	int			error;
	int			flags;	/* XFS_ALLOC_FLAG_... locking flags */
	struct xfs_mount	*mp;	/* mount structure pointer */
	xfs_agnumber_t		sagno;	/* starting allocation group number */
	xfs_alloctype_t		type;	/* input allocation type */
	int			bump_rotor = 0;
	xfs_agnumber_t		rotorstep = xfs_rotorstep; /* inode32 agf stepper */

	mp = args->mp;
	type = args->otype = args->type;
	args->agbno = NULLAGBLOCK;
	/*
	 * Just fix this up, for the case where the last a.g. is shorter
	 * (or there's only one a.g.) and the caller couldn't easily figure
	 * that out (xfs_bmap_alloc).
	 */
	agsize = mp->m_sb.sb_agblocks;
	if (args->maxlen > agsize)
		args->maxlen = agsize;
	if (args->alignment == 0)
		args->alignment = 1;
	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
	ASSERT(args->minlen <= args->maxlen);
	ASSERT(args->minlen <= agsize);
	ASSERT(args->mod < args->prod);
	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
	    args->minlen > args->maxlen || args->minlen > agsize ||
	    args->mod >= args->prod) {
		args->fsbno = NULLFSBLOCK;
		trace_xfs_alloc_vextent_badargs(args);
		return 0;
	}

	switch (type) {
	case XFS_ALLOCTYPE_THIS_AG:
	case XFS_ALLOCTYPE_NEAR_BNO:
	case XFS_ALLOCTYPE_THIS_BNO:
		/*
		 * These three force us into a single a.g.
		 */
		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
		args->pag = xfs_perag_get(mp, args->agno);
		error = xfs_alloc_fix_freelist(args, 0);
		if (error) {
			trace_xfs_alloc_vextent_nofix(args);
			goto error0;
		}
		if (!args->agbp) {
			trace_xfs_alloc_vextent_noagbp(args);
			break;
		}
		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
		if ((error = xfs_alloc_ag_vextent(args)))
			goto error0;
		break;
	case XFS_ALLOCTYPE_START_BNO:
		/*
		 * Try near allocation first, then anywhere-in-ag after
		 * the first a.g. fails.
		 */
		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
			args->fsbno = XFS_AGB_TO_FSB(mp,
					((mp->m_agfrotor / rotorstep) %
					mp->m_sb.sb_agcount), 0);
			bump_rotor = 1;
		}
		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
		args->type = XFS_ALLOCTYPE_NEAR_BNO;
		/* FALLTHROUGH */
	case XFS_ALLOCTYPE_FIRST_AG:
		/*
		 * Rotate through the allocation groups looking for a winner.
		 */
		if (type == XFS_ALLOCTYPE_FIRST_AG) {
			/*
			 * Start with allocation group given by bno.
			 */
			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
			args->type = XFS_ALLOCTYPE_THIS_AG;
			sagno = 0;
			flags = 0;
		} else {
			/*
			 * Start with the given allocation group.
			 */
			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
			flags = XFS_ALLOC_FLAG_TRYLOCK;
		}
		/*
		 * Loop over allocation groups twice; first time with
		 * trylock set, second time without.
		 */
		for (;;) {
			args->pag = xfs_perag_get(mp, args->agno);
			error = xfs_alloc_fix_freelist(args, flags);
			if (error) {
				trace_xfs_alloc_vextent_nofix(args);
				goto error0;
			}
			/*
			 * If we get a buffer back then the allocation will fly.
			 */
			if (args->agbp) {
				if ((error = xfs_alloc_ag_vextent(args)))
					goto error0;
				break;
			}

			trace_xfs_alloc_vextent_loopfailed(args);

			/*
			 * Didn't work, figure out the next iteration.
			 */
			if (args->agno == sagno &&
			    type == XFS_ALLOCTYPE_START_BNO)
				args->type = XFS_ALLOCTYPE_THIS_AG;
			/*
			 * For the first allocation, we can try any AG to get
			 * space.  However, if we already have allocated a
			 * block, we don't want to try AGs whose number is below
			 * sagno. Otherwise, we may end up with out-of-order
			 * locking of AGF, which might cause deadlock.
			 */
			if (++(args->agno) == mp->m_sb.sb_agcount) {
				if (args->tp->t_firstblock != NULLFSBLOCK)
					args->agno = sagno;
				else
					args->agno = 0;
			}
			/*
			 * Reached the starting a.g., must either be done
			 * or switch to non-trylock mode.
			 */
			if (args->agno == sagno) {
				if (flags == 0) {
					args->agbno = NULLAGBLOCK;
					trace_xfs_alloc_vextent_allfailed(args);
					break;
				}

				flags = 0;
				if (type == XFS_ALLOCTYPE_START_BNO) {
					args->agbno = XFS_FSB_TO_AGBNO(mp,
						args->fsbno);
					args->type = XFS_ALLOCTYPE_NEAR_BNO;
				}
			}
			xfs_perag_put(args->pag);
		}
		if (bump_rotor) {
			if (args->agno == sagno)
				mp->m_agfrotor = (mp->m_agfrotor + 1) %
					(mp->m_sb.sb_agcount * rotorstep);
			else
				mp->m_agfrotor = (args->agno * rotorstep + 1) %
					(mp->m_sb.sb_agcount * rotorstep);
		}
		break;
	default:
		ASSERT(0);
		/* NOTREACHED */
	}
	if (args->agbno == NULLAGBLOCK)
		args->fsbno = NULLFSBLOCK;
	else {
		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
#ifdef DEBUG
		ASSERT(args->len >= args->minlen);
		ASSERT(args->len <= args->maxlen);
		ASSERT(args->agbno % args->alignment == 0);
		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
			args->len);
#endif
	}
	xfs_perag_put(args->pag);
	return 0;
error0:
	xfs_perag_put(args->pag);
	return error;
}
/* Ensure that the freelist is at full capacity. */
int
xfs_free_extent_fix_freelist(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	struct xfs_buf		**agbp)
{
	struct xfs_alloc_arg	args;
	int			error;

	memset(&args, 0, sizeof(struct xfs_alloc_arg));
	args.tp = tp;
	args.mp = tp->t_mountp;
	args.agno = agno;

	/*
	 * validate that the block number is legal - this enables us to detect
	 * and handle a silent filesystem corruption rather than crashing.
	 */
	if (args.agno >= args.mp->m_sb.sb_agcount)
		return -EFSCORRUPTED;

	args.pag = xfs_perag_get(args.mp, args.agno);
	ASSERT(args.pag);

	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
	if (error)
		goto out;

	*agbp = args.agbp;
out:
	xfs_perag_put(args.pag);
	return error;
}
/*
 * Free an extent.
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
 */
int
__xfs_free_extent(
	struct xfs_trans		*tp,
	xfs_fsblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	bool				skip_discard)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_buf			*agbp;
	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
	struct xfs_agf			*agf;
	int				error;
	unsigned int			busy_flags = 0;

	ASSERT(len != 0);
	ASSERT(type != XFS_AG_RESV_AGFL);

	if (XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_FREE_EXTENT))
		return -EIO;

	error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
	if (error)
		return error;
	agf = agbp->b_addr;

	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
		error = -EFSCORRUPTED;
		goto err;
	}

	/* validate the extent size is legal now we have the agf locked */
	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto err;
	}

	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
	if (error)
		goto err;

	if (skip_discard)
		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
	xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
	return 0;

err:
	xfs_trans_brelse(tp, agbp);
	return error;
}
struct xfs_alloc_query_range_info {
	xfs_alloc_query_range_fn	fn;
	void				*priv;
};

/* Format btree record and pass to our callback. */
STATIC int
xfs_alloc_query_range_helper(
	struct xfs_btree_cur		*cur,
	union xfs_btree_rec		*rec,
	void				*priv)
{
	struct xfs_alloc_query_range_info	*query = priv;
	struct xfs_alloc_rec_incore		irec;

	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
	return query->fn(cur, &irec, query->priv);
}
/* Find all free space within a given range of blocks. */
int
xfs_alloc_query_range(
	struct xfs_btree_cur		*cur,
	struct xfs_alloc_rec_incore	*low_rec,
	struct xfs_alloc_rec_incore	*high_rec,
	xfs_alloc_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_irec		low_brec;
	union xfs_btree_irec		high_brec;
	struct xfs_alloc_query_range_info	query;

	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
	low_brec.a = *low_rec;
	high_brec.a = *high_rec;
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_range(cur, &low_brec, &high_brec,
			xfs_alloc_query_range_helper, &query);
}
/* Find all free space records. */
int
xfs_alloc_query_all(
	struct xfs_btree_cur		*cur,
	xfs_alloc_query_range_fn	fn,
	void				*priv)
{
	struct xfs_alloc_query_range_info	query;

	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
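/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * could count free extents with a callback matching
 * xfs_alloc_query_range_fn; "count_helper" and "nr" are hypothetical names:
 *
 *	STATIC int
 *	count_helper(
 *		struct xfs_btree_cur		*cur,
 *		struct xfs_alloc_rec_incore	*rec,
 *		void				*priv)
 *	{
 *		unsigned int	*nr = priv;
 *
 *		(*nr)++;
 *		return 0;
 *	}
 *
 *	unsigned int	nr = 0;
 *	error = xfs_alloc_query_all(cur, count_helper, &nr);
 */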
/* Is there a record covering a given extent? */
int
xfs_alloc_has_record(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	bool			*exists)
{
	union xfs_btree_irec	low;
	union xfs_btree_irec	high;

	memset(&low, 0, sizeof(low));
	low.a.ar_startblock = bno;
	memset(&high, 0xFF, sizeof(high));
	high.a.ar_startblock = bno + len - 1;

	return xfs_btree_has_record(cur, &low, &high, exists);
}
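/*
 * Usage sketch (illustrative): a checker that believes the extent
 * [agbno, agbno + len) is in use can verify it does not overlap any bnobt
 * free space record:
 *
 *	bool	is_free;
 *
 *	error = xfs_alloc_has_record(cur, agbno, len, &is_free);
 *	if (!error && is_free)
 *		... the extent overlaps free space; flag it as corruption ...
 */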
/*
 * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
 * error code or XFS_ITER_*.
 */
int
xfs_agfl_walk(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf,
	struct xfs_buf		*agflbp,
	xfs_agfl_walk_fn	walk_fn,
	void			*priv)
{
	__be32			*agfl_bno;
	unsigned int		i;
	int			error;

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	i = be32_to_cpu(agf->agf_flfirst);

	/* Nothing to walk in an empty AGFL. */
	if (agf->agf_flcount == cpu_to_be32(0))
		return 0;

	/* Otherwise, walk from first to last, wrapping as needed. */
	for (;;) {
		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
		if (error)
			return error;
		if (i == be32_to_cpu(agf->agf_fllast))
			break;
		if (++i == xfs_agfl_size(mp))