37b3b4d6 1// SPDX-License-Identifier: GPL-2.0
2bd0ea18 2/*
5e656dbb 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
da23017d 4 * All Rights Reserved.
2bd0ea18 5 */
9c799827 6#include "libxfs_priv.h"
b626fb59
DC
7#include "xfs_fs.h"
8#include "xfs_format.h"
9#include "xfs_log_format.h"
10#include "xfs_shared.h"
11#include "xfs_trans_resv.h"
12#include "xfs_bit.h"
b626fb59 13#include "xfs_mount.h"
f944d3d0 14#include "xfs_defer.h"
b626fb59 15#include "xfs_btree.h"
631ac87a 16#include "xfs_rmap.h"
b626fb59
DC
17#include "xfs_alloc_btree.h"
18#include "xfs_alloc.h"
2cf10e4c 19#include "xfs_errortag.h"
b626fb59
DC
20#include "xfs_trace.h"
21#include "xfs_trans.h"
f93d2173 22#include "xfs_ag.h"
cf8ce220 23#include "xfs_ag_resv.h"
d5c1b462
BF
24#include "xfs_bmap.h"
25
7d84b02d 26struct kmem_cache *xfs_extfree_item_cache;
2bd0ea18 27
ff105f75
DC
28struct workqueue_struct *xfs_alloc_wq;
29
2bd0ea18 30#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
5e656dbb
BN
31
32#define XFSA_FIXUP_BNO_OK 1
33#define XFSA_FIXUP_CNT_OK 2
34
b8165508
DC
35/*
36 * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots at
37 * the beginning of the block for a proper header with the location information
38 * and CRC.
39 */
40unsigned int
41xfs_agfl_size(
42 struct xfs_mount *mp)
43{
44 unsigned int size = mp->m_sb.sb_sectsize;
45
b16a427a 46 if (xfs_has_crc(mp))
b8165508
DC
47 size -= sizeof(struct xfs_agfl);
48
49 return size / sizeof(xfs_agblock_t);
50}
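
/*
 * Worked example (illustrative, not part of the source, assuming the
 * usual 36-byte v5 AGFL header): with 512-byte sectors on a CRC-enabled
 * filesystem, (512 - 36) / sizeof(xfs_agblock_t) = 476 / 4 = 119 AGFL
 * slots. Without CRCs the whole sector is used: 512 / 4 = 128 slots.
 */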
51
2a96beb9
DW
52unsigned int
53xfs_refc_block(
54 struct xfs_mount *mp)
55{
b16a427a 56 if (xfs_has_rmapbt(mp))
2a96beb9 57 return XFS_RMAP_BLOCK(mp) + 1;
b16a427a 58 if (xfs_has_finobt(mp))
2a96beb9
DW
59 return XFS_FIBT_BLOCK(mp) + 1;
60 return XFS_IBT_BLOCK(mp) + 1;
61}
62
ef5340cd
DW
63xfs_extlen_t
64xfs_prealloc_blocks(
65 struct xfs_mount *mp)
66{
b16a427a 67 if (xfs_has_reflink(mp))
2a96beb9 68 return xfs_refc_block(mp) + 1;
b16a427a 69 if (xfs_has_rmapbt(mp))
ef5340cd 70 return XFS_RMAP_BLOCK(mp) + 1;
b16a427a 71 if (xfs_has_finobt(mp))
ef5340cd
DW
72 return XFS_FIBT_BLOCK(mp) + 1;
73 return XFS_IBT_BLOCK(mp) + 1;
74}
75
b8a8d6e5 76/*
bc4d60eb
DW
77 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
78 * guarantee that we can refill the AGFL prior to allocating space in a nearly
e4a32219 79 * full AG. Although the space described by the free space btrees, the
bc4d60eb
DW
80 * blocks used by the freesp btrees themselves, and the blocks owned by the
81 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
82 * free space in the AG drop so low that the free space btrees cannot refill an
83 * empty AGFL up to the minimum level. Rather than grind through empty AGs
84 * until the fs goes down, we subtract this many AG blocks from the incore
85 * fdblocks to ensure user allocation does not overcommit the space the
86 * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to
87 * withhold space from xfs_mod_fdblocks, so we do not account for that here.
88 */
89#define XFS_ALLOCBT_AGFL_RESERVE 4
90
91/*
92 * Compute the number of blocks that we set aside to guarantee the ability to
93 * refill the AGFL and handle a full bmap btree split.
94 *
b8a8d6e5
DW
95 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
96 * AGF buffer (PV 947395), we place constraints on the relationship among
97 * actual allocations for data blocks, freelist blocks, and potential file data
98 * bmap btree blocks. However, these restrictions may result in no actual space
99 * allocated for a delayed extent; for example, a data block in a certain AG is
100 * allocated, but there is no additional block for the bmap btree block that a
101 * split of the file's bmap btree would require. The result of this may
102 * lead to an infinite loop when the file gets flushed to disk and all delayed
103 * extents need to be actually allocated. To get around this, we explicitly set
104 * aside a few blocks which will not be reserved in delayed allocation.
105 *
bc4d60eb
DW
106 * For each AG, we need to reserve enough blocks to replenish a totally empty
107 * AGFL and 4 more to handle a potential split of the file's bmap btree.
b8a8d6e5
DW
108 */
109unsigned int
110xfs_alloc_set_aside(
111 struct xfs_mount *mp)
112{
bc4d60eb 113 return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
b8a8d6e5
DW
114}
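
/*
 * Worked example (illustrative): a filesystem with sb_agcount = 16 sets
 * aside 16 * (XFS_ALLOCBT_AGFL_RESERVE + 4) = 16 * 8 = 128 blocks from
 * the incore free block count.
 */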
115
116/*
117 * When deciding how much space to allocate out of an AG, we limit the
118 * allocation maximum size to the size of the AG. However, we cannot use all the
119 * blocks in the AG - some are permanently used by metadata. These
120 * blocks are generally:
121 * - the AG superblock, AGF, AGI and AGFL
122 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
123 * the AGI free inode and rmap btree root blocks.
124 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
125 * - the rmapbt root block
126 *
127 * The AG headers are sector sized, so the amount of space they take up is
128 * dependent on filesystem geometry. The others are all single blocks.
129 */
130unsigned int
131xfs_alloc_ag_max_usable(
132 struct xfs_mount *mp)
133{
134 unsigned int blocks;
135
136 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
bc4d60eb 137 blocks += XFS_ALLOCBT_AGFL_RESERVE;
b8a8d6e5 138 blocks += 3; /* AGF, AGI btree root blocks */
b16a427a 139 if (xfs_has_finobt(mp))
b8a8d6e5 140 blocks++; /* finobt root block */
b16a427a 141 if (xfs_has_rmapbt(mp))
bc4d60eb 142 blocks++; /* rmap root block */
b16a427a 143 if (xfs_has_reflink(mp))
868c70e3 144 blocks++; /* refcount root block */
b8a8d6e5
DW
145
146 return mp->m_sb.sb_agblocks - blocks;
147}
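
/*
 * Worked example (a sketch, assuming 512-byte sectors and 4k blocks):
 * the four sector-sized AG headers round up to a single fs block, so on
 * a v5 filesystem with finobt, rmapbt and reflink enabled the deduction
 * is 1 + XFS_ALLOCBT_AGFL_RESERVE + 3 + 1 + 1 + 1 = 11 blocks, giving a
 * maximum usable size of sb_agblocks - 11.
 */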
148
b194c7d8
BN
149/*
150 * Lookup the record equal to [bno, len] in the btree given by cur.
151 */
152STATIC int /* error */
153xfs_alloc_lookup_eq(
154 struct xfs_btree_cur *cur, /* btree cursor */
155 xfs_agblock_t bno, /* starting block of extent */
156 xfs_extlen_t len, /* length of extent */
157 int *stat) /* success/failure */
158{
7777d8c6
BF
159 int error;
160
b194c7d8
BN
161 cur->bc_rec.a.ar_startblock = bno;
162 cur->bc_rec.a.ar_blockcount = len;
7777d8c6 163 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
63789af3 164 cur->bc_ag.abt.active = (*stat == 1);
7777d8c6 165 return error;
b194c7d8
BN
166}
167
168/*
169 * Lookup the first record greater than or equal to [bno, len]
170 * in the btree given by cur.
171 */
a2ceac1f 172int /* error */
b194c7d8
BN
173xfs_alloc_lookup_ge(
174 struct xfs_btree_cur *cur, /* btree cursor */
175 xfs_agblock_t bno, /* starting block of extent */
176 xfs_extlen_t len, /* length of extent */
177 int *stat) /* success/failure */
178{
7777d8c6
BF
179 int error;
180
b194c7d8
BN
181 cur->bc_rec.a.ar_startblock = bno;
182 cur->bc_rec.a.ar_blockcount = len;
7777d8c6 183 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
63789af3 184 cur->bc_ag.abt.active = (*stat == 1);
7777d8c6 185 return error;
b194c7d8
BN
186}
187
188/*
189 * Lookup the first record less than or equal to [bno, len]
190 * in the btree given by cur.
191 */
1fe41a73 192int /* error */
b194c7d8
BN
193xfs_alloc_lookup_le(
194 struct xfs_btree_cur *cur, /* btree cursor */
195 xfs_agblock_t bno, /* starting block of extent */
196 xfs_extlen_t len, /* length of extent */
197 int *stat) /* success/failure */
198{
7777d8c6 199 int error;
b194c7d8
BN
200 cur->bc_rec.a.ar_startblock = bno;
201 cur->bc_rec.a.ar_blockcount = len;
7777d8c6 202 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
63789af3 203 cur->bc_ag.abt.active = (*stat == 1);
7777d8c6
BF
204 return error;
205}
206
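/*
 * Illustrative usage sketch (not part of the source, error handling
 * elided): find the first free extent of at least 8 blocks in a by-size
 * btree and read the record back through the cursor:
 *
 *	int		error, stat;
 *	xfs_agblock_t	bno;
 *	xfs_extlen_t	len;
 *
 *	error = xfs_alloc_lookup_ge(cnt_cur, 0, 8, &stat);
 *	if (!error && stat)
 *		error = xfs_alloc_get_rec(cnt_cur, &bno, &len, &stat);
 */
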
207static inline bool
208xfs_alloc_cur_active(
209 struct xfs_btree_cur *cur)
210{
63789af3 211 return cur && cur->bc_ag.abt.active;
b194c7d8
BN
212}
213
214/*
215 * Update the record referred to by cur to the value given
216 * by [bno, len].
217 * This either works (return 0) or gets an EFSCORRUPTED error.
218 */
219STATIC int /* error */
220xfs_alloc_update(
221 struct xfs_btree_cur *cur, /* btree cursor */
222 xfs_agblock_t bno, /* starting block of extent */
223 xfs_extlen_t len) /* length of extent */
224{
225 union xfs_btree_rec rec;
226
227 rec.alloc.ar_startblock = cpu_to_be32(bno);
228 rec.alloc.ar_blockcount = cpu_to_be32(len);
229 return xfs_btree_update(cur, &rec);
230}
231
c7005aef
DW
232/* Convert the ondisk btree record to its incore representation. */
233void
234xfs_alloc_btrec_to_irec(
235 const union xfs_btree_rec *rec,
236 struct xfs_alloc_rec_incore *irec)
237{
238 irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
239 irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
240}
241
242/* Simple checks for free space records. */
243xfs_failaddr_t
244xfs_alloc_check_irec(
245 struct xfs_btree_cur *cur,
246 const struct xfs_alloc_rec_incore *irec)
247{
248 struct xfs_perag *pag = cur->bc_ag.pag;
249
250 if (irec->ar_blockcount == 0)
251 return __this_address;
252
253 /* check for valid extent range, including overflow */
254 if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
255 return __this_address;
256
257 return NULL;
258}
259
e70bf9ba
DW
260static inline int
261xfs_alloc_complain_bad_rec(
262 struct xfs_btree_cur *cur,
263 xfs_failaddr_t fa,
264 const struct xfs_alloc_rec_incore *irec)
265{
266 struct xfs_mount *mp = cur->bc_mp;
267
268 xfs_warn(mp,
269 "%s Freespace BTree record corruption in AG %d detected at %pS!",
270 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
271 cur->bc_ag.pag->pag_agno, fa);
272 xfs_warn(mp,
273 "start block 0x%x block count 0x%x", irec->ar_startblock,
274 irec->ar_blockcount);
275 return -EFSCORRUPTED;
276}
277
b194c7d8
BN
278/*
279 * Get the data from the pointed-to record.
280 */
a2ceac1f 281int /* error */
b194c7d8
BN
282xfs_alloc_get_rec(
283 struct xfs_btree_cur *cur, /* btree cursor */
284 xfs_agblock_t *bno, /* output: starting block of extent */
285 xfs_extlen_t *len, /* output: length of extent */
286 int *stat) /* output: success/failure */
287{
c7005aef 288 struct xfs_alloc_rec_incore irec;
b194c7d8 289 union xfs_btree_rec *rec;
c7005aef 290 xfs_failaddr_t fa;
b194c7d8
BN
291 int error;
292
293 error = xfs_btree_get_rec(cur, &rec, stat);
3741f971
DW
294 if (error || !(*stat))
295 return error;
3741f971 296
c7005aef
DW
297 xfs_alloc_btrec_to_irec(rec, &irec);
298 fa = xfs_alloc_check_irec(cur, &irec);
299 if (fa)
e70bf9ba 300 return xfs_alloc_complain_bad_rec(cur, fa, &irec);
ec291989 301
c7005aef
DW
302 *bno = irec.ar_startblock;
303 *len = irec.ar_blockcount;
ec291989 304 return 0;
b194c7d8
BN
305}
306
2bd0ea18
NS
307/*
308 * Compute aligned version of the found extent.
309 * Takes alignment and min length into account.
310 */
cd80de04 311STATIC bool
2bd0ea18 312xfs_alloc_compute_aligned(
a2ceac1f 313 xfs_alloc_arg_t *args, /* allocation argument structure */
2bd0ea18
NS
314 xfs_agblock_t foundbno, /* starting block in found extent */
315 xfs_extlen_t foundlen, /* length in found extent */
2bd0ea18 316 xfs_agblock_t *resbno, /* result block number */
cd80de04
CH
317 xfs_extlen_t *reslen, /* result length */
318 unsigned *busy_gen)
2bd0ea18 319{
cd80de04
CH
320 xfs_agblock_t bno = foundbno;
321 xfs_extlen_t len = foundlen;
ff3263dd 322 xfs_extlen_t diff;
cd80de04 323 bool busy;
2bd0ea18 324
a2ceac1f 325 /* Trim busy sections out of found extent */
cd80de04 326 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
a2ceac1f 327
ff3263dd
BF
328 /*
329 * If we have a largish extent that happens to start before min_agbno,
330 * see if we can shift it into range...
331 */
332 if (bno < args->min_agbno && bno + len > args->min_agbno) {
333 diff = args->min_agbno - bno;
334 if (len > diff) {
335 bno += diff;
336 len -= diff;
337 }
338 }
339
a2ceac1f
DC
340 if (args->alignment > 1 && len >= args->minlen) {
341 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
ff3263dd
BF
342
343 diff = aligned_bno - bno;
a2ceac1f
DC
344
345 *resbno = aligned_bno;
346 *reslen = diff >= len ? 0 : len - diff;
2bd0ea18 347 } else {
a2ceac1f
DC
348 *resbno = bno;
349 *reslen = len;
2bd0ea18 350 }
cd80de04
CH
351
352 return busy;
2bd0ea18
NS
353}
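
/*
 * Worked example (illustrative): a found extent [bno = 13, len = 20]
 * with args->alignment = 8 rounds the start up to aligned_bno = 16, a
 * shift of 3 blocks, so the aligned result is [16, 17]. If the shift
 * consumed the whole extent, the resulting length would be 0.
 */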
354
355/*
356 * Compute best start block and diff for "near" allocations.
357 * freelen >= wantlen already checked by caller.
358 */
359STATIC xfs_extlen_t /* difference value (absolute) */
360xfs_alloc_compute_diff(
361 xfs_agblock_t wantbno, /* target starting block */
362 xfs_extlen_t wantlen, /* target length */
363 xfs_extlen_t alignment, /* target alignment */
1fccd5c8 364 int datatype, /* are we allocating data? */
2bd0ea18
NS
365 xfs_agblock_t freebno, /* freespace's starting block */
366 xfs_extlen_t freelen, /* freespace's length */
367 xfs_agblock_t *newbnop) /* result: best start block from free */
368{
369 xfs_agblock_t freeend; /* end of freespace extent */
370 xfs_agblock_t newbno1; /* return block number */
371 xfs_agblock_t newbno2; /* other new block number */
0e266570
NS
372 xfs_extlen_t newlen1=0; /* length with newbno1 */
373 xfs_extlen_t newlen2=0; /* length with newbno2 */
2bd0ea18 374 xfs_agblock_t wantend; /* end of target extent */
a85522b6 375 bool userdata = datatype & XFS_ALLOC_USERDATA;
2bd0ea18
NS
376
377 ASSERT(freelen >= wantlen);
378 freeend = freebno + freelen;
379 wantend = wantbno + wantlen;
84a62eea
DC
380 /*
381 * We want to allocate from the start of a free extent if it is past
382 * the desired block or if we are allocating user data and the free
383 * extent is before the desired block. The second case is there to allow
384 * for contiguous allocation from the remaining free space if the file
385 * grows in the short term.
386 */
387 if (freebno >= wantbno || (userdata && freeend < wantend)) {
2bd0ea18
NS
388 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
389 newbno1 = NULLAGBLOCK;
390 } else if (freeend >= wantend && alignment > 1) {
391 newbno1 = roundup(wantbno, alignment);
392 newbno2 = newbno1 - alignment;
393 if (newbno1 >= freeend)
394 newbno1 = NULLAGBLOCK;
395 else
396 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
397 if (newbno2 < freebno)
398 newbno2 = NULLAGBLOCK;
399 else
400 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
401 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
402 if (newlen1 < newlen2 ||
403 (newlen1 == newlen2 &&
404 XFS_ABSDIFF(newbno1, wantbno) >
405 XFS_ABSDIFF(newbno2, wantbno)))
406 newbno1 = newbno2;
407 } else if (newbno2 != NULLAGBLOCK)
408 newbno1 = newbno2;
409 } else if (freeend >= wantend) {
410 newbno1 = wantbno;
411 } else if (alignment > 1) {
412 newbno1 = roundup(freeend - wantlen, alignment);
413 if (newbno1 > freeend - wantlen &&
414 newbno1 - alignment >= freebno)
415 newbno1 -= alignment;
416 else if (newbno1 >= freeend)
417 newbno1 = NULLAGBLOCK;
418 } else
419 newbno1 = freeend - wantlen;
420 *newbnop = newbno1;
421 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
422}
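
/*
 * Worked example (illustrative): wantbno = 100, wantlen = 10,
 * alignment = 1 and a free extent [120, 50]. Since freebno >= wantbno,
 * we allocate from the start of the free extent: *newbnop = 120 and the
 * returned diff is XFS_ABSDIFF(120, 100) = 20.
 */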
423
424/*
425 * Fix up the length, based on mod and prod.
426 * len should be k * prod + mod for some k.
427 * If len is too small it is returned unchanged.
428 * If len hits maxlen it is left alone.
429 */
430STATIC void
431xfs_alloc_fix_len(
dfc130f3 432 xfs_alloc_arg_t *args) /* allocation argument structure */
2bd0ea18
NS
433{
434 xfs_extlen_t k;
435 xfs_extlen_t rlen;
436
437 ASSERT(args->mod < args->prod);
438 rlen = args->len;
439 ASSERT(rlen >= args->minlen);
440 ASSERT(rlen <= args->maxlen);
441 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
442 (args->mod == 0 && rlen < args->prod))
443 return;
444 k = rlen % args->prod;
445 if (k == args->mod)
446 return;
ff105f75
DC
447 if (k > args->mod)
448 rlen = rlen - (k - args->mod);
449 else
450 rlen = rlen - args->prod + (args->mod - k);
19ebedcf 451 /* casts to (int) catch length underflows */
ff105f75
DC
452 if ((int)rlen < (int)args->minlen)
453 return;
454 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
455 ASSERT(rlen % args->prod == args->mod);
2c003dc2
CH
456 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
457 rlen + args->minleft);
2bd0ea18
NS
458 args->len = rlen;
459}
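
/*
 * Worked example (illustrative): with prod = 4, mod = 1 and a candidate
 * length of 11, k = 11 % 4 = 3 > mod, so the length is trimmed by
 * (k - mod) = 2 down to 9, which satisfies 9 = 2 * prod + mod. If the
 * trim were to drop the length below minlen, args->len would be left
 * unchanged instead.
 */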
460
2bd0ea18
NS
461/*
462 * Update the two btrees, logically removing from freespace the extent
463 * starting at rbno, rlen blocks. The extent is contained within the
464 * actual (current) free extent fbno for flen blocks.
465 * Flags are passed in indicating whether the cursors are set to the
466 * relevant records.
467 */
468STATIC int /* error code */
469xfs_alloc_fixup_trees(
ec924d04
DW
470 struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
471 struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
2bd0ea18
NS
472 xfs_agblock_t fbno, /* starting block of free extent */
473 xfs_extlen_t flen, /* length of free extent */
474 xfs_agblock_t rbno, /* starting block of returned extent */
475 xfs_extlen_t rlen, /* length of returned extent */
476 int flags) /* flags, XFSA_FIXUP_... */
477{
478 int error; /* error code */
479 int i; /* operation results */
480 xfs_agblock_t nfbno1; /* first new free startblock */
481 xfs_agblock_t nfbno2; /* second new free startblock */
0e266570
NS
482 xfs_extlen_t nflen1=0; /* first new free length */
483 xfs_extlen_t nflen2=0; /* second new free length */
19ebedcf
DC
484 struct xfs_mount *mp;
485
486 mp = cnt_cur->bc_mp;
2bd0ea18
NS
487
488 /*
489 * Look up the record in the by-size tree if necessary.
490 */
491 if (flags & XFSA_FIXUP_CNT_OK) {
492#ifdef DEBUG
0e266570 493 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
2bd0ea18 494 return error;
fbb4fa7f
DW
495 if (XFS_IS_CORRUPT(mp,
496 i != 1 ||
497 nfbno1 != fbno ||
498 nflen1 != flen))
499 return -EFSCORRUPTED;
2bd0ea18
NS
500#endif
501 } else {
0e266570 502 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
2bd0ea18 503 return error;
fbb4fa7f
DW
504 if (XFS_IS_CORRUPT(mp, i != 1))
505 return -EFSCORRUPTED;
2bd0ea18
NS
506 }
507 /*
508 * Look up the record in the by-block tree if necessary.
509 */
510 if (flags & XFSA_FIXUP_BNO_OK) {
511#ifdef DEBUG
0e266570 512 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
2bd0ea18 513 return error;
fbb4fa7f
DW
514 if (XFS_IS_CORRUPT(mp,
515 i != 1 ||
516 nfbno1 != fbno ||
517 nflen1 != flen))
518 return -EFSCORRUPTED;
2bd0ea18
NS
519#endif
520 } else {
0e266570 521 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
2bd0ea18 522 return error;
fbb4fa7f
DW
523 if (XFS_IS_CORRUPT(mp, i != 1))
524 return -EFSCORRUPTED;
2bd0ea18 525 }
b3563c19 526
2bd0ea18 527#ifdef DEBUG
b3563c19
BN
528 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
529 struct xfs_btree_block *bnoblock;
530 struct xfs_btree_block *cntblock;
531
5df9b067
DW
532 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
533 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
2bd0ea18 534
fbb4fa7f
DW
535 if (XFS_IS_CORRUPT(mp,
536 bnoblock->bb_numrecs !=
537 cntblock->bb_numrecs))
538 return -EFSCORRUPTED;
2bd0ea18
NS
539 }
540#endif
b3563c19 541
2bd0ea18
NS
542 /*
543 * Deal with all four cases: the allocated record is contained
544 * within the freespace record, so we can have new freespace
545 * at either (or both) end, or no freespace remaining.
546 */
547 if (rbno == fbno && rlen == flen)
548 nfbno1 = nfbno2 = NULLAGBLOCK;
549 else if (rbno == fbno) {
550 nfbno1 = rbno + rlen;
551 nflen1 = flen - rlen;
552 nfbno2 = NULLAGBLOCK;
553 } else if (rbno + rlen == fbno + flen) {
554 nfbno1 = fbno;
555 nflen1 = flen - rlen;
556 nfbno2 = NULLAGBLOCK;
557 } else {
558 nfbno1 = fbno;
559 nflen1 = rbno - fbno;
560 nfbno2 = rbno + rlen;
561 nflen2 = (fbno + flen) - nfbno2;
562 }
563 /*
564 * Delete the entry from the by-size btree.
565 */
b194c7d8 566 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 567 return error;
fbb4fa7f
DW
568 if (XFS_IS_CORRUPT(mp, i != 1))
569 return -EFSCORRUPTED;
2bd0ea18
NS
570 /*
571 * Add new by-size btree entry(s).
572 */
573 if (nfbno1 != NULLAGBLOCK) {
0e266570 574 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
2bd0ea18 575 return error;
fbb4fa7f
DW
576 if (XFS_IS_CORRUPT(mp, i != 0))
577 return -EFSCORRUPTED;
b194c7d8 578 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 579 return error;
fbb4fa7f
DW
580 if (XFS_IS_CORRUPT(mp, i != 1))
581 return -EFSCORRUPTED;
2bd0ea18
NS
582 }
583 if (nfbno2 != NULLAGBLOCK) {
0e266570 584 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
2bd0ea18 585 return error;
fbb4fa7f
DW
586 if (XFS_IS_CORRUPT(mp, i != 0))
587 return -EFSCORRUPTED;
b194c7d8 588 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 589 return error;
fbb4fa7f
DW
590 if (XFS_IS_CORRUPT(mp, i != 1))
591 return -EFSCORRUPTED;
2bd0ea18
NS
592 }
593 /*
594 * Fix up the by-block btree entry(s).
595 */
596 if (nfbno1 == NULLAGBLOCK) {
597 /*
598 * No remaining freespace, just delete the by-block tree entry.
599 */
b194c7d8 600 if ((error = xfs_btree_delete(bno_cur, &i)))
2bd0ea18 601 return error;
fbb4fa7f
DW
602 if (XFS_IS_CORRUPT(mp, i != 1))
603 return -EFSCORRUPTED;
2bd0ea18
NS
604 } else {
605 /*
606 * Update the by-block entry to start later|be shorter.
607 */
0e266570 608 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
2bd0ea18
NS
609 return error;
610 }
611 if (nfbno2 != NULLAGBLOCK) {
612 /*
613 * 2 resulting free entries, need to add one.
614 */
0e266570 615 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
2bd0ea18 616 return error;
fbb4fa7f
DW
617 if (XFS_IS_CORRUPT(mp, i != 0))
618 return -EFSCORRUPTED;
b194c7d8 619 if ((error = xfs_btree_insert(bno_cur, &i)))
2bd0ea18 620 return error;
fbb4fa7f
DW
621 if (XFS_IS_CORRUPT(mp, i != 1))
622 return -EFSCORRUPTED;
2bd0ea18
NS
623 }
624 return 0;
625}
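
/*
 * Worked example (illustrative): carving [rbno = 110, rlen = 10] out of
 * the free extent [fbno = 100, flen = 30] leaves freespace on both
 * sides: nfbno1/nflen1 = [100, 10] and nfbno2/nflen2 = [120, 10]. The
 * old by-size record is deleted and two new ones inserted, while the
 * by-block record is updated in place and one new record is inserted.
 */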
626
daa2d820
DC
627/*
628 * We do not verify the AGFL contents against AGF-based index counters here,
629 * even though we may have access to the perag that contains shadow copies. We
630 * don't know if the AGF-based counters have been checked, and if they have,
631 * they may still be inconsistent because they haven't yet been reset on the first
632 * allocation after the AGF has been read in.
633 *
634 * This means we can only check that all agfl entries contain valid or null
635 * values because we can't reliably determine the active range to exclude
636 * NULLAGBLOCK as a valid value.
637 *
638 * However, we can't even do that for v4 format filesystems because there are
639 * old versions of mkfs out there that do not initialise the AGFL to known,
640 * verifiable values. Hence we can't tell the difference between an AGFL block
641 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
642 *
643 * As a result, we can only fully validate AGFL block numbers when we pull them
644 * from the freelist in xfs_alloc_get_freelist().
645 */
bc01119d 646static xfs_failaddr_t
a2ceac1f
DC
647xfs_agfl_verify(
648 struct xfs_buf *bp)
649{
7861ef77 650 struct xfs_mount *mp = bp->b_mount;
a2ceac1f 651 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
b134e771 652 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
a2ceac1f
DC
653 int i;
654
b16a427a 655 if (!xfs_has_crc(mp))
95d9582b
DW
656 return NULL;
657
68dbe77f 658 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
bc01119d 659 return __this_address;
68dbe77f 660 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
bc01119d 661 return __this_address;
dd5b876e
DC
662 /*
663 * during growfs operations, the perag is not fully initialised,
664 * so we can't use it for any useful checking. growfs ensures we can't
665 * use it by using uncached buffers that don't have the perag attached
666 * so we can detect and avoid this problem.
667 */
668 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
bc01119d 669 return __this_address;
dd5b876e 670
b8165508 671 for (i = 0; i < xfs_agfl_size(mp); i++) {
b134e771
CH
672 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
673 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
bc01119d 674 return __this_address;
a2ceac1f 675 }
a65d8d29 676
bc01119d
DW
677 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
678 return __this_address;
679 return NULL;
dd5b876e
DC
680}
681
682static void
683xfs_agfl_read_verify(
684 struct xfs_buf *bp)
685{
7861ef77 686 struct xfs_mount *mp = bp->b_mount;
1e697959 687 xfs_failaddr_t fa;
dd5b876e
DC
688
689 /*
690 * There is no verification of non-crc AGFLs because mkfs does not
691 * initialise the AGFL to zero or NULL. Hence the only valid part of the
692 * AGFL is what the AGF says is active. We can't get to the AGF, so we
693 * can't verify just those entries are valid.
694 */
b16a427a 695 if (!xfs_has_crc(mp))
dd5b876e
DC
696 return;
697
45922933 698 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
1e697959
DW
699 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
700 else {
701 fa = xfs_agfl_verify(bp);
702 if (fa)
703 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
704 }
a2ceac1f
DC
705}
706
707static void
708xfs_agfl_write_verify(
709 struct xfs_buf *bp)
710{
7861ef77 711 struct xfs_mount *mp = bp->b_mount;
37d086ca 712 struct xfs_buf_log_item *bip = bp->b_log_item;
1e697959 713 xfs_failaddr_t fa;
a2ceac1f 714
dd5b876e 715 /* no verification of non-crc AGFLs */
b16a427a 716 if (!xfs_has_crc(mp))
dd5b876e
DC
717 return;
718
1e697959
DW
719 fa = xfs_agfl_verify(bp);
720 if (fa) {
721 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
dd5b876e
DC
722 return;
723 }
724
725 if (bip)
726 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
727
43b5aeed 728 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
a2ceac1f
DC
729}
730
731const struct xfs_buf_ops xfs_agfl_buf_ops = {
a3fac935 732 .name = "xfs_agfl",
68dbe77f 733 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
a2ceac1f
DC
734 .verify_read = xfs_agfl_read_verify,
735 .verify_write = xfs_agfl_write_verify,
95d9582b 736 .verify_struct = xfs_agfl_verify,
a2ceac1f
DC
737};
738
2bd0ea18
NS
739/*
740 * Read in the allocation group free block array.
741 */
75c01ccc 742int
2bd0ea18 743xfs_alloc_read_agfl(
75c01ccc
DC
744 struct xfs_perag *pag,
745 struct xfs_trans *tp,
746 struct xfs_buf **bpp)
2bd0ea18 747{
75c01ccc
DC
748 struct xfs_mount *mp = pag->pag_mount;
749 struct xfs_buf *bp;
750 int error;
2bd0ea18 751
9440d84d
NS
752 error = xfs_trans_read_buf(
753 mp, tp, mp->m_ddev_targp,
75c01ccc 754 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
a2ceac1f 755 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
9440d84d 756 if (error)
2bd0ea18 757 return error;
a2ceac1f 758 xfs_buf_set_ref(bp, XFS_AGFL_REF);
2bd0ea18
NS
759 *bpp = bp;
760 return 0;
761}
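
/*
 * Illustrative usage sketch (not part of the source, error handling
 * elided): read the AGFL for a perag and get at the on-disk slot array:
 *
 *	struct xfs_buf	*agflbp;
 *	__be32		*agfl_bno;
 *
 *	error = xfs_alloc_read_agfl(pag, tp, &agflbp);
 *	if (!error)
 *		agfl_bno = xfs_buf_to_agfl_bno(agflbp);
 */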
762
a2ceac1f
DC
763STATIC int
764xfs_alloc_update_counters(
765 struct xfs_trans *tp,
a2ceac1f
DC
766 struct xfs_buf *agbp,
767 long len)
768{
0bc284c2 769 struct xfs_agf *agf = agbp->b_addr;
a2ceac1f 770
10c0a390 771 agbp->b_pag->pagf_freeblks += len;
a2ceac1f
DC
772 be32_add_cpu(&agf->agf_freeblks, len);
773
a2ceac1f 774 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
a0264b73 775 be32_to_cpu(agf->agf_length))) {
1be76d11 776 xfs_buf_mark_corrupt(agbp);
12b53197 777 return -EFSCORRUPTED;
a0264b73 778 }
a2ceac1f
DC
779
780 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
781 return 0;
782}
783
2bd0ea18 784/*
2950989c 785 * Block allocation algorithm and data structures.
2bd0ea18 786 */
2950989c
BF
787struct xfs_alloc_cur {
788 struct xfs_btree_cur *cnt; /* btree cursors */
789 struct xfs_btree_cur *bnolt;
790 struct xfs_btree_cur *bnogt;
4d66edb1 791 xfs_extlen_t cur_len;/* current search length */
19fe42e3
BF
792 xfs_agblock_t rec_bno;/* extent startblock */
793 xfs_extlen_t rec_len;/* extent length */
794 xfs_agblock_t bno; /* alloc bno */
795 xfs_extlen_t len; /* alloc len */
796 xfs_extlen_t diff; /* diff from search bno */
e055d59e
BF
797 unsigned int busy_gen;/* busy state */
798 bool busy;
2950989c
BF
799};
800
801/*
802 * Set up cursors, etc. in the extent allocation cursor. This function can be
803 * called multiple times to reset an initialized structure without having to
804 * reallocate cursors.
805 */
806static int
807xfs_alloc_cur_setup(
808 struct xfs_alloc_arg *args,
809 struct xfs_alloc_cur *acur)
810{
811 int error;
812 int i;
813
4d66edb1 814 acur->cur_len = args->maxlen;
19fe42e3
BF
815 acur->rec_bno = 0;
816 acur->rec_len = 0;
817 acur->bno = 0;
818 acur->len = 0;
04ea0ac1 819 acur->diff = -1;
e055d59e
BF
820 acur->busy = false;
821 acur->busy_gen = 0;
822
2950989c
BF
823 /*
824 * Perform an initial cntbt lookup to check for availability of maxlen
825 * extents. If this fails, we'll return -ENOSPC to signal the caller to
826 * attempt a small allocation.
827 */
828 if (!acur->cnt)
829 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
ecb44e84 830 args->agbp, args->pag, XFS_BTNUM_CNT);
2950989c
BF
831 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
832 if (error)
833 return error;
834
835 /*
836 * Allocate the bnobt left and right search cursors.
837 */
838 if (!acur->bnolt)
839 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
ecb44e84 840 args->agbp, args->pag, XFS_BTNUM_BNO);
2950989c
BF
841 if (!acur->bnogt)
842 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
ecb44e84 843 args->agbp, args->pag, XFS_BTNUM_BNO);
2950989c
BF
844 return i == 1 ? 0 : -ENOSPC;
845}
846
847static void
848xfs_alloc_cur_close(
849 struct xfs_alloc_cur *acur,
850 bool error)
851{
852 int cur_error = XFS_BTREE_NOERROR;
853
854 if (error)
855 cur_error = XFS_BTREE_ERROR;
856
857 if (acur->cnt)
858 xfs_btree_del_cursor(acur->cnt, cur_error);
859 if (acur->bnolt)
860 xfs_btree_del_cursor(acur->bnolt, cur_error);
861 if (acur->bnogt)
862 xfs_btree_del_cursor(acur->bnogt, cur_error);
863 acur->cnt = acur->bnolt = acur->bnogt = NULL;
864}
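
/*
 * Illustrative lifecycle sketch (not part of the source): a NEAR mode
 * search sets the cursors up once, may rerun the search after a restart
 * without reallocating them, and tears everything down through a single
 * exit path:
 *
 *	struct xfs_alloc_cur	acur = {};
 *
 *	error = xfs_alloc_cur_setup(args, &acur);
 *	...search the btrees, possibly jumping back to the setup...
 *	xfs_alloc_cur_close(&acur, error != 0);
 */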
2bd0ea18 865
04ea0ac1
BF
866/*
867 * Check an extent for allocation and track the best available candidate in the
868 * allocation structure. The cursor is deactivated if it has entered an out of
869 * range state based on allocation arguments. Optionally return the
870 * extent geometry and allocation status if requested by the caller.
871 */
872static int
873xfs_alloc_cur_check(
874 struct xfs_alloc_arg *args,
875 struct xfs_alloc_cur *acur,
876 struct xfs_btree_cur *cur,
877 int *new)
878{
879 int error, i;
880 xfs_agblock_t bno, bnoa, bnew;
881 xfs_extlen_t len, lena, diff = -1;
882 bool busy;
883 unsigned busy_gen = 0;
884 bool deactivate = false;
3ca39168 885 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
04ea0ac1
BF
886
887 *new = 0;
888
889 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
890 if (error)
891 return error;
fbb4fa7f
DW
892 if (XFS_IS_CORRUPT(args->mp, i != 1))
893 return -EFSCORRUPTED;
04ea0ac1
BF
894
895 /*
896 * Check minlen and deactivate a cntbt cursor if out of acceptable size
897 * range (i.e., walking backwards looking for a minlen extent).
898 */
899 if (len < args->minlen) {
3ca39168 900 deactivate = !isbnobt;
04ea0ac1
BF
901 goto out;
902 }
903
904 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
905 &busy_gen);
906 acur->busy |= busy;
907 if (busy)
908 acur->busy_gen = busy_gen;
909 /* deactivate a bnobt cursor outside of locality range */
3ca39168
BF
910 if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
911 deactivate = isbnobt;
04ea0ac1 912 goto out;
3ca39168 913 }
04ea0ac1
BF
914 if (lena < args->minlen)
915 goto out;
916
917 args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
918 xfs_alloc_fix_len(args);
919 ASSERT(args->len >= args->minlen);
920 if (args->len < acur->len)
921 goto out;
922
923 /*
924 * We have an aligned record that satisfies minlen and beats or matches
925 * the candidate extent size. Compare locality for near allocation mode.
926 */
04ea0ac1
BF
927 diff = xfs_alloc_compute_diff(args->agbno, args->len,
928 args->alignment, args->datatype,
929 bnoa, lena, &bnew);
930 if (bnew == NULLAGBLOCK)
931 goto out;
3ca39168
BF
932
933 /*
934 * Deactivate a bnobt cursor with worse locality than the current best.
935 */
936 if (diff > acur->diff) {
937 deactivate = isbnobt;
04ea0ac1 938 goto out;
3ca39168 939 }
04ea0ac1
BF
940
941 ASSERT(args->len > acur->len ||
942 (args->len == acur->len && diff <= acur->diff));
943 acur->rec_bno = bno;
944 acur->rec_len = len;
945 acur->bno = bnew;
946 acur->len = args->len;
947 acur->diff = diff;
948 *new = 1;
949
4f2eee5a
BF
950 /*
951 * We're done if we found a perfect allocation. This only deactivates
952 * the current cursor, but this is just an optimization to terminate a
953 * cntbt search that otherwise runs to the edge of the tree.
954 */
955 if (acur->diff == 0 && acur->len == args->maxlen)
956 deactivate = true;
04ea0ac1
BF
957out:
958 if (deactivate)
63789af3 959 cur->bc_ag.abt.active = false;
04ea0ac1
BF
960 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
961 *new);
962 return 0;
963}
964
dacde37d
BF
965/*
966 * Complete an allocation of a candidate extent. Remove the extent from both
967 * trees and update the args structure.
968 */
969STATIC int
970xfs_alloc_cur_finish(
971 struct xfs_alloc_arg *args,
972 struct xfs_alloc_cur *acur)
973{
0bc284c2 974 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
dacde37d
BF
975 int error;
976
977 ASSERT(acur->cnt && acur->bnolt);
978 ASSERT(acur->bno >= acur->rec_bno);
979 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
0bc284c2 980 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
dacde37d
BF
981
982 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
983 acur->rec_len, acur->bno, acur->len, 0);
984 if (error)
985 return error;
986
987 args->agbno = acur->bno;
988 args->len = acur->len;
989 args->wasfromfl = 0;
990
991 trace_xfs_alloc_cur(args);
992 return 0;
993}
994
4d66edb1
BF
995/*
996 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
997 * bno optimized lookup to search for extents with ideal size and locality.
998 */
999STATIC int
1000xfs_alloc_cntbt_iter(
1001 struct xfs_alloc_arg *args,
1002 struct xfs_alloc_cur *acur)
1003{
1004 struct xfs_btree_cur *cur = acur->cnt;
1005 xfs_agblock_t bno;
1006 xfs_extlen_t len, cur_len;
1007 int error;
1008 int i;
1009
1010 if (!xfs_alloc_cur_active(cur))
1011 return 0;
1012
1013 /* locality optimized lookup */
1014 cur_len = acur->cur_len;
1015 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
1016 if (error)
1017 return error;
1018 if (i == 0)
1019 return 0;
1020 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1021 if (error)
1022 return error;
1023
1024 /* check the current record and update search length from it */
1025 error = xfs_alloc_cur_check(args, acur, cur, &i);
1026 if (error)
1027 return error;
1028 ASSERT(len >= acur->cur_len);
1029 acur->cur_len = len;
1030
1031 /*
1032 * We looked up the first record >= [agbno, len] above. The agbno is a
1033 * secondary key and so the current record may lie just before or after
1034 * agbno. If it is past agbno, check the previous record too so long as
1035 * the length matches as it may be closer. Don't check a smaller record
1036 * because that could deactivate our cursor.
1037 */
1038 if (bno > args->agbno) {
1039 error = xfs_btree_decrement(cur, 0, &i);
1040 if (!error && i) {
1041 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1042 if (!error && i && len == acur->cur_len)
1043 error = xfs_alloc_cur_check(args, acur, cur,
1044 &i);
1045 }
1046 if (error)
1047 return error;
1048 }
1049
1050 /*
1051 * Bump the search key by a single block if we have no allocation
1052 * candidate yet, or if the extent we found was already at least double
1053 * the key. Otherwise, double the search key to optimize the search.
1054 * Efficiency is more important here than absolute best locality.
1055 */
1056 cur_len <<= 1;
1057 if (!acur->len || acur->cur_len >= cur_len)
1058 acur->cur_len++;
1059 else
1060 acur->cur_len = cur_len;
1061
1062 return error;
1063}
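
/*
 * Worked example (illustrative): with a search key of 16 blocks and a
 * candidate already recorded, finding a 16 block extent doubles the
 * next lookup key to 32. If the found record is instead 100 blocks
 * long, the key has already leapt past the doubled value, so it only
 * advances by one to 101 for the next lookup.
 */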
1064
ba02381c
BF
1065/*
1066 * Deal with the case where only small freespaces remain. Either return the
1067 * contents of the last freespace record, or allocate space from the freelist if
1068 * there is nothing in the tree.
1069 */
1070STATIC int /* error */
1071xfs_alloc_ag_vextent_small(
1072 struct xfs_alloc_arg *args, /* allocation argument structure */
1073 struct xfs_btree_cur *ccur, /* optional by-size cursor */
1074 xfs_agblock_t *fbnop, /* result block number */
1075 xfs_extlen_t *flenp, /* result length */
1076 int *stat) /* status: 0-freelist, 1-normal/none */
1077{
0bc284c2 1078 struct xfs_agf *agf = args->agbp->b_addr;
ba02381c
BF
1079 int error = 0;
1080 xfs_agblock_t fbno = NULLAGBLOCK;
1081 xfs_extlen_t flen = 0;
9e1862f0 1082 int i = 0;
ba02381c 1083
9e1862f0
BF
1084 /*
1085 * If a cntbt cursor is provided, try to allocate the largest record in
1086 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1087 * allocation. Make sure to respect minleft even when pulling from the
1088 * freelist.
1089 */
1090 if (ccur)
1091 error = xfs_btree_decrement(ccur, 0, &i);
ba02381c
BF
1092 if (error)
1093 goto error;
1094 if (i) {
1095 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1096 if (error)
1097 goto error;
fbb4fa7f
DW
1098 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1099 error = -EFSCORRUPTED;
1100 goto error;
1101 }
ba02381c
BF
1102 goto out;
1103 }
1104
1105 if (args->minlen != 1 || args->alignment != 1 ||
1106 args->resv == XFS_AG_RESV_AGFL ||
0bc284c2 1107 be32_to_cpu(agf->agf_flcount) <= args->minleft)
ba02381c
BF
1108 goto out;
1109
1d202c10
DC
1110 error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1111 &fbno, 0);
ba02381c
BF
1112 if (error)
1113 goto error;
1114 if (fbno == NULLAGBLOCK)
1115 goto out;
1116
7635c486 1117 xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
a85522b6 1118 (args->datatype & XFS_ALLOC_NOBUSY));
ba02381c 1119
a85522b6 1120 if (args->datatype & XFS_ALLOC_USERDATA) {
ba02381c
BF
1121 struct xfs_buf *bp;
1122
78fcd346
DW
1123 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1124 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1125 args->mp->m_bsize, 0, &bp);
1126 if (error)
ba02381c 1127 goto error;
ba02381c
BF
1128 xfs_trans_binval(args->tp, bp);
1129 }
2d7ea81f
BF
1130 *fbnop = args->agbno = fbno;
1131 *flenp = args->len = 1;
0bc284c2 1132 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
fbb4fa7f
DW
1133 error = -EFSCORRUPTED;
1134 goto error;
1135 }
ba02381c
BF
1136 args->wasfromfl = 1;
1137 trace_xfs_alloc_small_freelist(args);
1138
1139 /*
1140 * If we're feeding an AGFL block to something that doesn't live in the
1141 * free space, we need to clear out the OWN_AG rmap.
1142 */
195c248c 1143 error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
ba02381c
BF
1144 &XFS_RMAP_OINFO_AG);
1145 if (error)
1146 goto error;
1147
1148 *stat = 0;
1149 return 0;
1150
1151out:
1152 /*
1153 * Can't do the allocation, give up.
1154 */
1155 if (flen < args->minlen) {
1156 args->agbno = NULLAGBLOCK;
1157 trace_xfs_alloc_small_notenough(args);
1158 flen = 0;
1159 }
1160 *fbnop = fbno;
1161 *flenp = flen;
1162 *stat = 1;
1163 trace_xfs_alloc_small_done(args);
1164 return 0;
1165
1166error:
1167 trace_xfs_alloc_small_error(args);
1168 return error;
1169}
1170
2bd0ea18
NS
1171/*
1172 * Allocate a variable extent at exactly agno/bno.
1173 * Extent's length (returned in *len) will be between minlen and maxlen,
1174 * and of the form k * prod + mod unless there's nothing that large.
1175 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1176 */
1177STATIC int /* error */
1178xfs_alloc_ag_vextent_exact(
dfc130f3 1179 xfs_alloc_arg_t *args) /* allocation argument structure */
2bd0ea18 1180{
0bc284c2 1181 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
ec924d04
DW
1182 struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1183 struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
2bd0ea18
NS
1184 int error;
1185 xfs_agblock_t fbno; /* start block of found extent */
2bd0ea18 1186 xfs_extlen_t flen; /* length of found extent */
cd80de04
CH
1187 xfs_agblock_t tbno; /* start block of busy extent */
1188 xfs_extlen_t tlen; /* length of busy extent */
1189 xfs_agblock_t tend; /* end block of busy extent */
2bd0ea18 1190 int i; /* success/failure of operation */
cd80de04 1191 unsigned busy_gen;
2bd0ea18
NS
1192
1193 ASSERT(args->alignment == 1);
a2ceac1f 1194
2bd0ea18
NS
1195 /*
1196 * Allocate/initialize a cursor for the by-number freespace btree.
1197 */
b194c7d8 1198 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
ecb44e84 1199 args->pag, XFS_BTNUM_BNO);
56b2de80 1200
2bd0ea18
NS
1201 /*
1202 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1203 * Look for the closest free block <= bno; it must contain bno
1204 * if any free block does.
1205 */
56b2de80
DC
1206 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1207 if (error)
2bd0ea18 1208 goto error0;
56b2de80
DC
1209 if (!i)
1210 goto not_found;
1211
2bd0ea18
NS
1212 /*
1213 * Grab the freespace record.
1214 */
56b2de80
DC
1215 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1216 if (error)
2bd0ea18 1217 goto error0;
fbb4fa7f
DW
1218 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1219 error = -EFSCORRUPTED;
1220 goto error0;
1221 }
2bd0ea18 1222 ASSERT(fbno <= args->agbno);
56b2de80 1223
5000d01d 1224 /*
a2ceac1f
DC
1225 * Check for overlapping busy extents.
1226 */
cd80de04
CH
1227 tbno = fbno;
1228 tlen = flen;
1229 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
a2ceac1f
DC
1230
1231 /*
1232 * Give up if the start of the extent is busy, or the freespace isn't
1233 * long enough for the minimum request.
2bd0ea18 1234 */
a2ceac1f
DC
1235 if (tbno > args->agbno)
1236 goto not_found;
1237 if (tlen < args->minlen)
1238 goto not_found;
1239 tend = tbno + tlen;
1240 if (tend < args->agbno + args->minlen)
56b2de80
DC
1241 goto not_found;
1242
2bd0ea18
NS
1243 /*
1244 * End of extent will be the smaller of the freespace end and the
1245 * maximal requested end.
56b2de80 1246 *
2bd0ea18
NS
1247 * Fix the length according to mod and prod if given.
1248 */
a2ceac1f
DC
1249 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1250 - args->agbno;
2bd0ea18 1251 xfs_alloc_fix_len(args);
a2ceac1f 1252 ASSERT(args->agbno + args->len <= tend);
56b2de80 1253
2bd0ea18 1254 /*
a2ceac1f 1255 * We are allocating agbno for args->len blocks.
2bd0ea18
NS
1256 * Allocate/initialize a cursor for the by-size btree.
1257 */
b194c7d8 1258 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
ecb44e84 1259 args->pag, XFS_BTNUM_CNT);
0bc284c2 1260 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
56b2de80
DC
1261 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1262 args->len, XFSA_FIXUP_BNO_OK);
1263 if (error) {
2bd0ea18
NS
1264 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1265 goto error0;
1266 }
a2ceac1f 1267
2bd0ea18
NS
1268 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1269 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
a2ceac1f 1270
2bd0ea18 1271 args->wasfromfl = 0;
56b2de80
DC
1272 trace_xfs_alloc_exact_done(args);
1273 return 0;
1274
1275not_found:
1276 /* Didn't find it, return null. */
1277 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1278 args->agbno = NULLAGBLOCK;
1279 trace_xfs_alloc_exact_notfound(args);
2bd0ea18
NS
1280 return 0;
1281
1282error0:
1283 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
56b2de80
DC
1284 trace_xfs_alloc_exact_error(args);
1285 return error;
1286}
1287
1288/*
4f2eee5a
BF
1289 * Search a given number of btree records in a given direction. Check each
1290 * record against the good extent we've already found.
56b2de80
DC
1291 */
1292STATIC int
4f2eee5a 1293xfs_alloc_walk_iter(
3ca39168
BF
1294 struct xfs_alloc_arg *args,
1295 struct xfs_alloc_cur *acur,
1296 struct xfs_btree_cur *cur,
4f2eee5a
BF
1297 bool increment,
1298 bool find_one, /* quit on first candidate */
1299 int count, /* rec count (-1 for infinite) */
1300 int *stat)
56b2de80 1301{
56b2de80
DC
1302 int error;
1303 int i;
56b2de80 1304
4f2eee5a
BF
1305 *stat = 0;
1306
56b2de80 1307 /*
3ca39168
BF
1308 * Search so long as the cursor is active or we find a better extent.
1309 * The cursor is deactivated if it extends beyond the range of the
1310 * current allocation candidate.
56b2de80 1311 */
4f2eee5a 1312 while (xfs_alloc_cur_active(cur) && count) {
3ca39168 1313 error = xfs_alloc_cur_check(args, acur, cur, &i);
56b2de80 1314 if (error)
3ca39168 1315 return error;
4f2eee5a
BF
1316 if (i == 1) {
1317 *stat = 1;
1318 if (find_one)
1319 break;
1320 }
3ca39168
BF
1321 if (!xfs_alloc_cur_active(cur))
1322 break;
56b2de80 1323
3ca39168
BF
1324 if (increment)
1325 error = xfs_btree_increment(cur, 0, &i);
56b2de80 1326 else
3ca39168 1327 error = xfs_btree_decrement(cur, 0, &i);
56b2de80 1328 if (error)
3ca39168
BF
1329 return error;
1330 if (i == 0)
63789af3 1331 cur->bc_ag.abt.active = false;
4f2eee5a
BF
1332
1333 if (count > 0)
1334 count--;
3ca39168 1335 }
56b2de80 1336
56b2de80 1337 return 0;
2bd0ea18
NS
1338}
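
/*
 * Illustrative usage sketch (not part of the source): scan at most 8
 * records to the right and stop at the first candidate:
 *
 *	error = xfs_alloc_walk_iter(args, acur, acur->bnogt,
 *			true, true, 8, &i);
 *
 * A count of -1 walks until the cursor deactivates; find_one = false
 * keeps going past the first hit in search of a better extent.
 */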
1339
d0ebd9ee 1340/*
4d66edb1
BF
1341 * Search the by-bno and by-size btrees in parallel in search of an extent with
1342 * ideal locality based on the NEAR mode ->agbno locality hint.
d0ebd9ee
BF
1343 */
1344STATIC int
4d66edb1 1345xfs_alloc_ag_vextent_locality(
d0ebd9ee
BF
1346 struct xfs_alloc_arg *args,
1347 struct xfs_alloc_cur *acur,
1348 int *stat)
1349{
1350 struct xfs_btree_cur *fbcur = NULL;
1351 int error;
1352 int i;
1353 bool fbinc;
1354
1355 ASSERT(acur->len == 0);
d0ebd9ee
BF
1356
1357 *stat = 0;
1358
4d66edb1
BF
1359 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1360 if (error)
1361 return error;
d0ebd9ee
BF
1362 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1363 if (error)
1364 return error;
1365 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1366 if (error)
1367 return error;
1368
1369 /*
4d66edb1
BF
1370 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1371 * right and lookup the closest extent to the locality hint for each
1372 * extent size key in the cntbt. The entire search terminates
1373 * immediately on a bnobt hit because that means we've found best case
1374 * locality. Otherwise the search continues until the cntbt cursor runs
1375 * off the end of the tree. If no allocation candidate is found at this
1376 * point, give up on locality, walk backwards from the end of the cntbt
1377 * and take the first available extent.
1378 *
1379 * The parallel tree searches balance each other out to provide fairly
1380 * consistent performance for various situations. The bnobt search can
1381 * have pathological behavior in the worst case scenario of larger
1382 * allocation requests and fragmented free space. On the other hand, the
1383 * bnobt is able to satisfy most smaller allocation requests much more
1384 * quickly than the cntbt. The cntbt search can sift through fragmented
1385 * free space and sets of free extents for larger allocation requests
1386 * more quickly than the bnobt. Since the locality hint is just a hint
1387 * and we don't want to scan the entire bnobt for perfect locality, the
1388 * cntbt search essentially bounds the bnobt search such that we can
1389 * find good enough locality at reasonable performance in most cases.
d0ebd9ee
BF
1390 */
1391 while (xfs_alloc_cur_active(acur->bnolt) ||
4d66edb1
BF
1392 xfs_alloc_cur_active(acur->bnogt) ||
1393 xfs_alloc_cur_active(acur->cnt)) {
1394
1395 trace_xfs_alloc_cur_lookup(args);
1396
1397 /*
1398 * Search the bnobt left and right. In the case of a hit, finish
1399 * the search in the opposite direction and we're done.
1400 */
d0ebd9ee
BF
1401 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1402 true, 1, &i);
1403 if (error)
1404 return error;
1405 if (i == 1) {
1406 trace_xfs_alloc_cur_left(args);
1407 fbcur = acur->bnogt;
1408 fbinc = true;
1409 break;
1410 }
d0ebd9ee
BF
1411 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1412 1, &i);
1413 if (error)
1414 return error;
1415 if (i == 1) {
1416 trace_xfs_alloc_cur_right(args);
1417 fbcur = acur->bnolt;
1418 fbinc = false;
1419 break;
1420 }
4d66edb1
BF
1421
1422 /*
1423 * Check the extent with best locality based on the current
1424 * extent size search key and keep track of the best candidate.
1425 */
1426 error = xfs_alloc_cntbt_iter(args, acur);
1427 if (error)
1428 return error;
1429 if (!xfs_alloc_cur_active(acur->cnt)) {
1430 trace_xfs_alloc_cur_lookup_done(args);
1431 break;
1432 }
1433 }
1434
1435 /*
1436 * If we failed to find anything due to busy extents, return empty
1437 * handed so the caller can flush and retry. If no busy extents were
1438 * found, walk backwards from the end of the cntbt as a last resort.
1439 */
1440 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1441 error = xfs_btree_decrement(acur->cnt, 0, &i);
1442 if (error)
1443 return error;
1444 if (i) {
63789af3 1445 acur->cnt->bc_ag.abt.active = true;
4d66edb1
BF
1446 fbcur = acur->cnt;
1447 fbinc = false;
1448 }
d0ebd9ee
BF
1449 }
1450
4d66edb1
BF
1451 /*
1452 * Search in the opposite direction for a better entry in the case of
1453 * a bnobt hit or walk backwards from the end of the cntbt.
1454 */
d0ebd9ee
BF
1455 if (fbcur) {
1456 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1457 &i);
1458 if (error)
1459 return error;
1460 }
1461
1462 if (acur->len)
1463 *stat = 1;
1464
1465 return 0;
1466}
1467
0bb673ce
DW
1468/* Check the last block of the cnt btree for allocations. */
1469static int
1470xfs_alloc_ag_vextent_lastblock(
1471 struct xfs_alloc_arg *args,
1472 struct xfs_alloc_cur *acur,
1473 xfs_agblock_t *bno,
1474 xfs_extlen_t *len,
1475 bool *allocated)
1476{
1477 int error;
1478 int i;
1479
1480#ifdef DEBUG
1481 /* Randomly don't execute the first algorithm. */
9a046f96 1482 if (get_random_u32_below(2))
0bb673ce
DW
1483 return 0;
1484#endif
1485
1486 /*
1487 * Start from the entry that lookup found, sequence through all larger
1488 * free blocks. If we're actually pointing at a record smaller than
1489 * maxlen, go to the start of this block, and skip all those smaller
1490 * than minlen.
1491 */
2c711623 1492 if (*len || args->alignment > 1) {
5df9b067 1493 acur->cnt->bc_levels[0].ptr = 1;
0bb673ce
DW
1494 do {
1495 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1496 if (error)
1497 return error;
fbb4fa7f
DW
1498 if (XFS_IS_CORRUPT(args->mp, i != 1))
1499 return -EFSCORRUPTED;
0bb673ce
DW
1500 if (*len >= args->minlen)
1501 break;
1502 error = xfs_btree_increment(acur->cnt, 0, &i);
1503 if (error)
1504 return error;
1505 } while (i);
1506 ASSERT(*len >= args->minlen);
1507 if (!i)
1508 return 0;
1509 }
1510
1511 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1512 if (error)
1513 return error;
1514
1515 /*
1516 * It didn't work. We COULD be in a case where there's a good record
1517 * somewhere, so try again.
1518 */
1519 if (acur->len == 0)
1520 return 0;
1521
1522 trace_xfs_alloc_near_first(args);
1523 *allocated = true;
1524 return 0;
1525}
1526
2bd0ea18
NS
1527/*
1528 * Allocate a variable extent near bno in the allocation group agno.
1529 * Extent's length (returned in len) will be between minlen and maxlen,
1530 * and of the form k * prod + mod unless there's nothing that large.
1531 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1532 */
2950989c 1533STATIC int
2bd0ea18 1534xfs_alloc_ag_vextent_near(
01f05365
DC
1535 struct xfs_alloc_arg *args,
1536 uint32_t alloc_flags)
2bd0ea18 1537{
2950989c 1538 struct xfs_alloc_cur acur = {};
3ca39168
BF
1539 int error; /* error code */
1540 int i; /* result code, temporary */
3ca39168
BF
1541 xfs_agblock_t bno;
1542 xfs_extlen_t len;
a2ceac1f 1543
9bef1574 1544 /* handle uninitialized agbno range so caller doesn't have to */
ff3263dd
BF
1545 if (!args->min_agbno && !args->max_agbno)
1546 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1547 ASSERT(args->min_agbno <= args->max_agbno);
1548
1549 /* clamp agbno to the range if it's outside */
1550 if (args->agbno < args->min_agbno)
1551 args->agbno = args->min_agbno;
1552 if (args->agbno > args->max_agbno)
1553 args->agbno = args->max_agbno;
1554
4127f5dc
DC
1555 /* Retry once quickly if we find busy extents before blocking. */
1556 alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
a2ceac1f 1557restart:
3ca39168 1558 len = 0;
a2ceac1f 1559
2bd0ea18 1560 /*
2950989c
BF
1561 * Set up cursors and see if there are any free extents as big as
1562 * maxlen. If not, pick the last entry in the tree unless the tree is
1563 * empty.
5000d01d 1564 */
2950989c
BF
1565 error = xfs_alloc_cur_setup(args, &acur);
1566 if (error == -ENOSPC) {
3ca39168
BF
1567 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1568 &len, &i);
2950989c
BF
1569 if (error)
1570 goto out;
3ca39168 1571 if (i == 0 || len == 0) {
a2ceac1f 1572 trace_xfs_alloc_near_noentry(args);
2950989c 1573 goto out;
2bd0ea18
NS
1574 }
1575 ASSERT(i == 1);
2950989c
BF
1576 } else if (error) {
1577 goto out;
2bd0ea18 1578 }
a2ceac1f 1579
5000d01d 1580 /*
2bd0ea18
NS
1581 * First algorithm.
1582 * If the requested extent is large wrt the freespaces available
1583 * in this a.g., then the cursor will be pointing to a btree entry
1584 * near the right edge of the tree. If it's in the last btree leaf
1585 * block, then we just examine all the entries in that block
1586 * that are big enough, and pick the best one.
2bd0ea18 1587 */
0bb673ce
DW
1588 if (xfs_btree_islastblock(acur.cnt, 0)) {
1589 bool allocated = false;
4f2eee5a 1590
0bb673ce
DW
1591 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1592 &allocated);
4f2eee5a
BF
1593 if (error)
1594 goto out;
0bb673ce
DW
1595 if (allocated)
1596 goto alloc_finish;
2bd0ea18 1597 }
2950989c 1598
2bd0ea18 1599 /*
4d66edb1
BF
1600 * Second algorithm. Combined cntbt and bnobt search to find ideal
1601 * locality.
2bd0ea18 1602 */
4d66edb1 1603 error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
2950989c
BF
1604 if (error)
1605 goto out;
1606
2bd0ea18
NS
1607 /*
1608 * If we couldn't get anything, give up.
1609 */
3ca39168 1610 if (!acur.len) {
e055d59e 1611 if (acur.busy) {
4127f5dc
DC
1612 /*
1613 * Our only valid extents must have been busy. Flush and
1614 * retry the allocation again. If we get an -EAGAIN
1615 * error, we're being told that a deadlock was avoided
1616 * and the current transaction needs committing before
1617 * the allocation can be retried.
1618 */
a2ceac1f 1619 trace_xfs_alloc_near_busy(args);
4127f5dc
DC
1620 error = xfs_extent_busy_flush(args->tp, args->pag,
1621 acur.busy_gen, alloc_flags);
1622 if (error)
1623 goto out;
1624
1625 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
a2ceac1f
DC
1626 goto restart;
1627 }
56b2de80 1628 trace_xfs_alloc_size_neither(args);
2bd0ea18 1629 args->agbno = NULLAGBLOCK;
2950989c 1630 goto out;
2bd0ea18 1631 }
56b2de80 1632
0bb673ce 1633alloc_finish:
dacde37d
BF
1634 /* fix up btrees on a successful allocation */
1635 error = xfs_alloc_cur_finish(args, &acur);
56b2de80 1636
2950989c
BF
1637out:
1638 xfs_alloc_cur_close(&acur, error);
2bd0ea18
NS
1639 return error;
1640}
1641
1642/*
1643 * Allocate a variable extent anywhere in the allocation group agno.
1644 * Extent's length (returned in len) will be between minlen and maxlen,
1645 * and of the form k * prod + mod unless there's nothing that large.
1646 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1647 */
01f05365 1648static int
2bd0ea18 1649xfs_alloc_ag_vextent_size(
01f05365
DC
1650 struct xfs_alloc_arg *args,
1651 uint32_t alloc_flags)
2bd0ea18 1652{
01f05365
DC
1653 struct xfs_agf *agf = args->agbp->b_addr;
1654 struct xfs_btree_cur *bno_cur;
1655 struct xfs_btree_cur *cnt_cur;
1656 xfs_agblock_t fbno; /* start of found freespace */
1657 xfs_extlen_t flen; /* length of found freespace */
1658 xfs_agblock_t rbno; /* returned block number */
1659 xfs_extlen_t rlen; /* length of returned extent */
1660 bool busy;
1661 unsigned busy_gen;
1662 int error;
1663 int i;
2bd0ea18 1664
4127f5dc
DC
1665 /* Retry once quickly if we find busy extents before blocking. */
1666 alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
a2ceac1f 1667restart:
2bd0ea18
NS
1668 /*
1669 * Allocate and initialize a cursor for the by-size btree.
1670 */
b194c7d8 1671 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
ecb44e84 1672 args->pag, XFS_BTNUM_CNT);
2bd0ea18 1673 bno_cur = NULL;
a2ceac1f 1674
2bd0ea18
NS
1675 /*
1676 * Look for an entry >= maxlen+alignment-1 blocks.
1677 */
0e266570
NS
1678 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1679 args->maxlen + args->alignment - 1, &i)))
2bd0ea18 1680 goto error0;
a2ceac1f 1681
2bd0ea18 1682 /*
cd80de04
CH
1683 * If none then we have to settle for a smaller extent. In the case that
1684 * there are no large extents, this will return the last entry in the
1685 * tree unless the tree is empty. In the case that there are only busy
1686 * large extents, this will return the largest small extent unless there
a2ceac1f 1687 * are no smaller extents available.
5000d01d 1688 */
cd80de04 1689 if (!i) {
a2ceac1f
DC
1690 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1691 &fbno, &flen, &i);
1692 if (error)
2bd0ea18
NS
1693 goto error0;
1694 if (i == 0 || flen == 0) {
1695 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
56b2de80 1696 trace_xfs_alloc_size_noentry(args);
2bd0ea18
NS
1697 return 0;
1698 }
1699 ASSERT(i == 1);
cd80de04
CH
1700 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1701 &rlen, &busy_gen);
a2ceac1f
DC
1702 } else {
1703 /*
1704 * Search for a non-busy extent that is large enough.
a2ceac1f
DC
1705 */
1706 for (;;) {
1707 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1708 if (error)
1709 goto error0;
fbb4fa7f
DW
1710 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1711 error = -EFSCORRUPTED;
1712 goto error0;
1713 }
a2ceac1f 1714
cd80de04
CH
1715 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1716 &rbno, &rlen, &busy_gen);
a2ceac1f
DC
1717
1718 if (rlen >= args->maxlen)
1719 break;
1720
1721 error = xfs_btree_increment(cnt_cur, 0, &i);
1722 if (error)
1723 goto error0;
4127f5dc
DC
1724 if (i)
1725 continue;
1726
1727 /*
1728 * Our only valid extents must have been busy. Flush and
1729 * retry the allocation again. If we get an -EAGAIN
1730 * error, we're being told that a deadlock was avoided
1731 * and the current transaction needs committing before
1732 * the allocation can be retried.
1733 */
1734 trace_xfs_alloc_size_busy(args);
1735 error = xfs_extent_busy_flush(args->tp, args->pag,
1736 busy_gen, alloc_flags);
1737 if (error)
1738 goto error0;
1739
1740 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1741 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1742 goto restart;
a2ceac1f 1743 }
2bd0ea18 1744 }
a2ceac1f 1745
2bd0ea18
NS
1746 /*
1747 * In the first case above, we got the last entry in the
1748 * by-size btree. Now we check to see if the space hits maxlen
1749 * once aligned; if not, we search left for something better.
1750 * This can't happen in the second case above.
1751 */
2bd0ea18 1752 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
fbb4fa7f
DW
1753 if (XFS_IS_CORRUPT(args->mp,
1754 rlen != 0 &&
1755 (rlen > flen ||
1756 rbno + rlen > fbno + flen))) {
1757 error = -EFSCORRUPTED;
1758 goto error0;
1759 }
2bd0ea18
NS
1760 if (rlen < args->maxlen) {
1761 xfs_agblock_t bestfbno;
1762 xfs_extlen_t bestflen;
1763 xfs_agblock_t bestrbno;
1764 xfs_extlen_t bestrlen;
1765
1766 bestrlen = rlen;
1767 bestrbno = rbno;
1768 bestflen = flen;
1769 bestfbno = fbno;
1770 for (;;) {
b194c7d8 1771 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
2bd0ea18
NS
1772 goto error0;
1773 if (i == 0)
1774 break;
0e266570
NS
1775 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1776 &i)))
2bd0ea18 1777 goto error0;
fbb4fa7f
DW
1778 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1779 error = -EFSCORRUPTED;
1780 goto error0;
1781 }
2bd0ea18
NS
1782 if (flen < bestrlen)
1783 break;
cd80de04
CH
1784 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1785 &rbno, &rlen, &busy_gen);
2bd0ea18 1786 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
fbb4fa7f
DW
1787 if (XFS_IS_CORRUPT(args->mp,
1788 rlen != 0 &&
1789 (rlen > flen ||
1790 rbno + rlen > fbno + flen))) {
1791 error = -EFSCORRUPTED;
1792 goto error0;
1793 }
2bd0ea18
NS
1794 if (rlen > bestrlen) {
1795 bestrlen = rlen;
1796 bestrbno = rbno;
1797 bestflen = flen;
1798 bestfbno = fbno;
1799 if (rlen == args->maxlen)
1800 break;
1801 }
5000d01d 1802 }
0e266570
NS
1803 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1804 &i)))
2bd0ea18 1805 goto error0;
fbb4fa7f
DW
1806 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1807 error = -EFSCORRUPTED;
1808 goto error0;
1809 }
2bd0ea18
NS
1810 rlen = bestrlen;
1811 rbno = bestrbno;
1812 flen = bestflen;
1813 fbno = bestfbno;
1814 }
1815 args->wasfromfl = 0;
1816 /*
1817 * Fix up the length.
1818 */
1819 args->len = rlen;
a2ceac1f 1820 if (rlen < args->minlen) {
cd80de04 1821 if (busy) {
4127f5dc
DC
1822 /*
1823 * Our only valid extents must have been busy. Flush and
1824 * retry the allocation again. If we get an -EAGAIN
1825 * error, we're being told that a deadlock was avoided
1826 * and the current transaction needs committing before
1827 * the allocation can be retried.
1828 */
a2ceac1f 1829 trace_xfs_alloc_size_busy(args);
4127f5dc
DC
1830 error = xfs_extent_busy_flush(args->tp, args->pag,
1831 busy_gen, alloc_flags);
1832 if (error)
1833 goto error0;
1834
1835 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
1836 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
a2ceac1f
DC
1837 goto restart;
1838 }
1839 goto out_nominleft;
2bd0ea18 1840 }
a2ceac1f
DC
1841 xfs_alloc_fix_len(args);
1842
2bd0ea18 1843 rlen = args->len;
fbb4fa7f
DW
1844 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1845 error = -EFSCORRUPTED;
1846 goto error0;
1847 }
2bd0ea18
NS
1848 /*
1849 * Allocate and initialize a cursor for the by-block tree.
1850 */
b194c7d8 1851 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
ecb44e84 1852 args->pag, XFS_BTNUM_BNO);
0e266570
NS
1853 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1854 rbno, rlen, XFSA_FIXUP_CNT_OK)))
2bd0ea18
NS
1855 goto error0;
1856 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1857 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1858 cnt_cur = bno_cur = NULL;
1859 args->len = rlen;
1860 args->agbno = rbno;
fbb4fa7f
DW
1861 if (XFS_IS_CORRUPT(args->mp,
1862 args->agbno + args->len >
0bc284c2 1863 be32_to_cpu(agf->agf_length))) {
fbb4fa7f
DW
1864 error = -EFSCORRUPTED;
1865 goto error0;
1866 }
56b2de80 1867 trace_xfs_alloc_size_done(args);
2bd0ea18
NS
1868 return 0;
1869
1870error0:
56b2de80 1871 trace_xfs_alloc_size_error(args);
2bd0ea18
NS
1872 if (cnt_cur)
1873 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1874 if (bno_cur)
1875 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1876 return error;
a2ceac1f
DC
1877
1878out_nominleft:
1879 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1880 trace_xfs_alloc_size_nominleft(args);
1881 args->agbno = NULLAGBLOCK;
1882 return 0;
2bd0ea18
NS
1883}
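/*
 * Illustrative sketch, not part of xfs_alloc.c: how a raw free extent
 * [fbno, fbno + flen) is trimmed to an aligned candidate, in the spirit
 * of the xfs_alloc_compute_aligned() calls above. Standalone userspace
 * C with hypothetical names; busy-extent trimming is omitted.
 */
#include <stdint.h>

static uint32_t
aligned_candidate(
	uint32_t	fbno,		/* start of raw free extent */
	uint32_t	flen,		/* length of raw free extent */
	uint32_t	align,		/* required alignment, >= 1 */
	uint32_t	*rbno)		/* out: aligned start */
{
	/* round the start up to the next multiple of the alignment */
	uint32_t	bno = ((fbno + align - 1) / align) * align;
	uint32_t	diff = bno - fbno;

	*rbno = bno;
	/* length left over once the unaligned head has been skipped */
	return diff >= flen ? 0 : flen - diff;
}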
1884
2bd0ea18
NS
1885/*
1886 * Free the extent starting at agno/bno for length.
1887 */
85aec44f 1888STATIC int
2bd0ea18 1889xfs_free_ag_extent(
5837e73b
DW
1890 struct xfs_trans *tp,
1891 struct xfs_buf *agbp,
1892 xfs_agnumber_t agno,
1893 xfs_agblock_t bno,
1894 xfs_extlen_t len,
1895 const struct xfs_owner_info *oinfo,
1896 enum xfs_ag_resv_type type)
2bd0ea18 1897{
5837e73b 1898 struct xfs_mount *mp;
5837e73b
DW
1899 struct xfs_btree_cur *bno_cur;
1900 struct xfs_btree_cur *cnt_cur;
1901 xfs_agblock_t gtbno; /* start of right neighbor */
1902 xfs_extlen_t gtlen; /* length of right neighbor */
1903 xfs_agblock_t ltbno; /* start of left neighbor */
1904 xfs_extlen_t ltlen; /* length of left neighbor */
1905 xfs_agblock_t nbno; /* new starting block of freesp */
1906 xfs_extlen_t nlen; /* new length of freespace */
1907 int haveleft; /* have a left neighbor */
1908 int haveright; /* have a right neighbor */
1909 int i;
1910 int error;
195c248c 1911 struct xfs_perag *pag = agbp->b_pag;
2bd0ea18 1912
631ac87a 1913 bno_cur = cnt_cur = NULL;
2bd0ea18 1914 mp = tp->t_mountp;
631ac87a 1915
3ee858aa 1916 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
195c248c 1917 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
631ac87a
DW
1918 if (error)
1919 goto error0;
1920 }
1921
5000d01d 1922 /*
2bd0ea18
NS
1923 * Allocate and initialize a cursor for the by-block btree.
1924 */
ecb44e84 1925 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
5000d01d 1926 /*
2bd0ea18
NS
1927 * Look for a neighboring block on the left (lower block numbers)
1928 * that is contiguous with this space.
1929 */
0e266570 1930 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
2bd0ea18
NS
1931 goto error0;
1932 if (haveleft) {
1933 /*
1934 * There is a block to our left.
1935 */
0e266570 1936 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
2bd0ea18 1937 goto error0;
fbb4fa7f
DW
1938 if (XFS_IS_CORRUPT(mp, i != 1)) {
1939 error = -EFSCORRUPTED;
1940 goto error0;
1941 }
2bd0ea18
NS
1942 /*
1943 * It's not contiguous, though.
1944 */
1945 if (ltbno + ltlen < bno)
1946 haveleft = 0;
1947 else {
1948 /*
1949 * If this failure happens the request to free this
1950 * space was invalid, it's (partly) already free.
1951 * Very bad.
1952 */
fbb4fa7f
DW
1953 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1954 error = -EFSCORRUPTED;
1955 goto error0;
1956 }
2bd0ea18
NS
1957 }
1958 }
5000d01d 1959 /*
2bd0ea18
NS
1960 * Look for a neighboring block on the right (higher block numbers)
1961 * that is contiguous with this space.
1962 */
b194c7d8 1963 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2bd0ea18
NS
1964 goto error0;
1965 if (haveright) {
1966 /*
1967 * There is a block to our right.
1968 */
0e266570 1969 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2bd0ea18 1970 goto error0;
fbb4fa7f
DW
1971 if (XFS_IS_CORRUPT(mp, i != 1)) {
1972 error = -EFSCORRUPTED;
1973 goto error0;
1974 }
2bd0ea18
NS
1975 /*
1976 * It's not contiguous, though.
1977 */
1978 if (bno + len < gtbno)
1979 haveright = 0;
1980 else {
1981 /*
1982 * If this failure happens the request to free this
1983 * space was invalid, it's (partly) already free.
1984 * Very bad.
1985 */
fbb4fa7f
DW
1986 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1987 error = -EFSCORRUPTED;
1988 goto error0;
1989 }
2bd0ea18
NS
1990 }
1991 }
1992 /*
1993 * Now allocate and initialize a cursor for the by-size tree.
1994 */
ecb44e84 1995 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2bd0ea18
NS
1996 /*
1997 * Have both left and right contiguous neighbors.
1998 * Merge all three into a single free block.
1999 */
2000 if (haveleft && haveright) {
2001 /*
2002 * Delete the old by-size entry on the left.
2003 */
0e266570 2004 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2bd0ea18 2005 goto error0;
fbb4fa7f
DW
2006 if (XFS_IS_CORRUPT(mp, i != 1)) {
2007 error = -EFSCORRUPTED;
2008 goto error0;
2009 }
b194c7d8 2010 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2011 goto error0;
fbb4fa7f
DW
2012 if (XFS_IS_CORRUPT(mp, i != 1)) {
2013 error = -EFSCORRUPTED;
2014 goto error0;
2015 }
2bd0ea18
NS
2016 /*
2017 * Delete the old by-size entry on the right.
2018 */
0e266570 2019 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2bd0ea18 2020 goto error0;
fbb4fa7f
DW
2021 if (XFS_IS_CORRUPT(mp, i != 1)) {
2022 error = -EFSCORRUPTED;
2023 goto error0;
2024 }
b194c7d8 2025 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2026 goto error0;
fbb4fa7f
DW
2027 if (XFS_IS_CORRUPT(mp, i != 1)) {
2028 error = -EFSCORRUPTED;
2029 goto error0;
2030 }
2bd0ea18
NS
2031 /*
2032 * Delete the old by-block entry for the right block.
2033 */
b194c7d8 2034 if ((error = xfs_btree_delete(bno_cur, &i)))
2bd0ea18 2035 goto error0;
fbb4fa7f
DW
2036 if (XFS_IS_CORRUPT(mp, i != 1)) {
2037 error = -EFSCORRUPTED;
2038 goto error0;
2039 }
2bd0ea18
NS
2040 /*
2041 * Move the by-block cursor back to the left neighbor.
2042 */
b194c7d8 2043 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2bd0ea18 2044 goto error0;
fbb4fa7f
DW
2045 if (XFS_IS_CORRUPT(mp, i != 1)) {
2046 error = -EFSCORRUPTED;
2047 goto error0;
2048 }
2bd0ea18
NS
2049#ifdef DEBUG
2050 /*
2051 * Check that this is the right record: delete didn't
2052 * mangle the cursor.
2053 */
2054 {
2055 xfs_agblock_t xxbno;
2056 xfs_extlen_t xxlen;
2057
0e266570
NS
2058 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2059 &i)))
2bd0ea18 2060 goto error0;
fbb4fa7f
DW
2061 if (XFS_IS_CORRUPT(mp,
2062 i != 1 ||
2063 xxbno != ltbno ||
2064 xxlen != ltlen)) {
2065 error = -EFSCORRUPTED;
2066 goto error0;
2067 }
2bd0ea18
NS
2068 }
2069#endif
2070 /*
2071 * Update remaining by-block entry to the new, joined block.
2072 */
2073 nbno = ltbno;
2074 nlen = len + ltlen + gtlen;
0e266570 2075 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
2076 goto error0;
2077 }
2078 /*
2079 * Have only a left contiguous neighbor.
2080 * Merge it together with the new freespace.
2081 */
2082 else if (haveleft) {
2083 /*
2084 * Delete the old by-size entry on the left.
2085 */
0e266570 2086 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2bd0ea18 2087 goto error0;
fbb4fa7f
DW
2088 if (XFS_IS_CORRUPT(mp, i != 1)) {
2089 error = -EFSCORRUPTED;
2090 goto error0;
2091 }
b194c7d8 2092 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2093 goto error0;
fbb4fa7f
DW
2094 if (XFS_IS_CORRUPT(mp, i != 1)) {
2095 error = -EFSCORRUPTED;
2096 goto error0;
2097 }
2bd0ea18
NS
2098 /*
2099 * Back up the by-block cursor to the left neighbor, and
2100 * update its length.
2101 */
b194c7d8 2102 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2bd0ea18 2103 goto error0;
fbb4fa7f
DW
2104 if (XFS_IS_CORRUPT(mp, i != 1)) {
2105 error = -EFSCORRUPTED;
2106 goto error0;
2107 }
2bd0ea18
NS
2108 nbno = ltbno;
2109 nlen = len + ltlen;
0e266570 2110 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
2111 goto error0;
2112 }
2113 /*
2114 * Have only a right contiguous neighbor.
2115 * Merge it together with the new freespace.
2116 */
2117 else if (haveright) {
2118 /*
2119 * Delete the old by-size entry on the right.
2120 */
0e266570 2121 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2bd0ea18 2122 goto error0;
fbb4fa7f
DW
2123 if (XFS_IS_CORRUPT(mp, i != 1)) {
2124 error = -EFSCORRUPTED;
2125 goto error0;
2126 }
b194c7d8 2127 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2128 goto error0;
fbb4fa7f
DW
2129 if (XFS_IS_CORRUPT(mp, i != 1)) {
2130 error = -EFSCORRUPTED;
2131 goto error0;
2132 }
2bd0ea18 2133 /*
5000d01d 2134 * Update the starting block and length of the right
2bd0ea18
NS
2135 * neighbor in the by-block tree.
2136 */
2137 nbno = bno;
2138 nlen = len + gtlen;
0e266570 2139 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
2140 goto error0;
2141 }
2142 /*
2143 * No contiguous neighbors.
2144 * Insert the new freespace into the by-block tree.
2145 */
2146 else {
2147 nbno = bno;
2148 nlen = len;
b194c7d8 2149 if ((error = xfs_btree_insert(bno_cur, &i)))
2bd0ea18 2150 goto error0;
fbb4fa7f
DW
2151 if (XFS_IS_CORRUPT(mp, i != 1)) {
2152 error = -EFSCORRUPTED;
2153 goto error0;
2154 }
2bd0ea18
NS
2155 }
2156 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2157 bno_cur = NULL;
2158 /*
2159 * In all cases we need to insert the new freespace in the by-size tree.
2160 */
0e266570 2161 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2bd0ea18 2162 goto error0;
fbb4fa7f
DW
2163 if (XFS_IS_CORRUPT(mp, i != 0)) {
2164 error = -EFSCORRUPTED;
2165 goto error0;
2166 }
b194c7d8 2167 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 2168 goto error0;
fbb4fa7f
DW
2169 if (XFS_IS_CORRUPT(mp, i != 1)) {
2170 error = -EFSCORRUPTED;
2171 goto error0;
2172 }
2bd0ea18
NS
2173 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2174 cnt_cur = NULL;
a2ceac1f 2175
2bd0ea18
NS
2176 /*
2177 * Update the freespace totals in the ag and superblock.
2178 */
10c0a390
GX
2179 error = xfs_alloc_update_counters(tp, agbp, len);
2180 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
a2ceac1f
DC
2181 if (error)
2182 goto error0;
2183
79896434
BD
2184 XFS_STATS_INC(mp, xs_freex);
2185 XFS_STATS_ADD(mp, xs_freeb, len);
56b2de80 2186
65a15e06 2187 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
3e535bba 2188
2bd0ea18
NS
2189 return 0;
2190
2191 error0:
65a15e06 2192 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2bd0ea18
NS
2193 if (bno_cur)
2194 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2195 if (cnt_cur)
2196 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2197 return error;
2198}
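/*
 * Illustrative sketch, not part of xfs_alloc.c: the neighbour-merging
 * policy of xfs_free_ag_extent() above reduced to interval arithmetic.
 * A neighbour only merges when it exactly abuts the freed range; the
 * btree record deletion/update is omitted. Hypothetical names.
 */
#include <stdbool.h>
#include <stdint.h>

static void
coalesce_freed_extent(
	uint32_t	bno, uint32_t len,		/* extent being freed */
	bool		haveleft, uint32_t ltbno, uint32_t ltlen,
	bool		haveright, uint32_t gtbno, uint32_t gtlen,
	uint32_t	*nbno, uint32_t *nlen)		/* resulting record */
{
	haveleft = haveleft && ltbno + ltlen == bno;	/* left touches us */
	haveright = haveright && bno + len == gtbno;	/* right touches us */

	*nbno = haveleft ? ltbno : bno;
	*nlen = len + (haveleft ? ltlen : 0) + (haveright ? gtlen : 0);
}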
2199
5000d01d 2200/*
2bd0ea18
NS
2201 * Visible (exported) allocation/free functions.
2202 * Some of these are used just by xfs_alloc_btree.c and this file.
2203 */
2204
2205/*
4aa2259d 2206 * Compute and fill in value of m_alloc_maxlevels.
2bd0ea18
NS
2207 */
2208void
2209xfs_alloc_compute_maxlevels(
2210 xfs_mount_t *mp) /* file system mount structure */
2211{
4aa2259d 2212 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
730e2a19 2213 (mp->m_sb.sb_agblocks + 1) / 2);
441815c7 2214 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2bd0ea18
NS
2215}
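/*
 * Illustrative sketch, not the library helper: a simplified model of
 * what xfs_btree_compute_maxlevels() answers above -- the height needed
 * for a btree whose blocks hold at least 'minrecs' records to index
 * 'len' leaf records. The real helper distinguishes leaf and node
 * minrecs; a single value is assumed here for brevity.
 */
static unsigned int
btree_maxlevels(unsigned int minrecs, unsigned long long len)
{
	unsigned int		level = 1;
	unsigned long long	maxblocks = (len + minrecs - 1) / minrecs;

	/* each extra level divides the block count by at least minrecs */
	while (maxblocks > 1) {
		maxblocks = (maxblocks + minrecs - 1) / minrecs;
		level++;
	}
	return level;
}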
2216
56b2de80 2217/*
cf8ce220
DW
2218 * Find the length of the longest extent in an AG. The 'need' parameter
2219 * specifies how much space we're going to need for the AGFL and the
2220 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2221 * other callers.
56b2de80
DC
2222 */
2223xfs_extlen_t
2224xfs_alloc_longest_free_extent(
72bda06d 2225 struct xfs_perag *pag,
cf8ce220
DW
2226 xfs_extlen_t need,
2227 xfs_extlen_t reserved)
56b2de80 2228{
72bda06d 2229 xfs_extlen_t delta = 0;
56b2de80 2230
cf8ce220
DW
2231 /*
2232 * If the AGFL needs a recharge, we'll have to subtract that from the
2233 * longest extent.
2234 */
56b2de80
DC
2235 if (need > pag->pagf_flcount)
2236 delta = need - pag->pagf_flcount;
2237
cf8ce220
DW
2238 /*
2239 * If we cannot maintain others' reservations with space from the
2240 * not-longest freesp extents, we'll have to subtract /that/ from
2241 * the longest extent too.
2242 */
2243 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2244 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2245
2246 /*
2247 * If the longest extent is long enough to satisfy all the
2248 * reservations and AGFL rules in place, we can return this extent.
2249 */
56b2de80 2250 if (pag->pagf_longest > delta)
14058d94
DC
2251 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2252 pag->pagf_longest - delta);
cf8ce220
DW
2253
2254 /* Otherwise, let the caller try for 1 block if there's space. */
56b2de80
DC
2255 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2256}
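/*
 * Illustrative sketch, not part of xfs_alloc.c: the arithmetic above as
 * a standalone function. Parameters are stand-ins for the perag fields;
 * the clamp against m_ag_max_usable is omitted.
 */
#include <stdint.h>

static uint32_t
longest_after_reservations(
	uint32_t	longest,	/* pagf_longest */
	uint32_t	freeblks,	/* pagf_freeblks */
	uint32_t	flcount,	/* pagf_flcount */
	uint32_t	need,		/* AGFL blocks we may consume */
	uint32_t	reserved)	/* blocks reserved by other callers */
{
	uint32_t	delta = 0;

	if (need > flcount)			/* AGFL refill cost */
		delta = need - flcount;
	if (freeblks - longest < reserved)	/* reservation shortfall */
		delta += reserved - (freeblks - longest);

	if (longest > delta)
		return longest - delta;
	/* otherwise the caller may still try for a single block */
	return flcount > 0 || longest > 0;
}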
2257
b75bb1bd
DW
2258/*
2259 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
2260 * return the largest possible minimum length.
2261 */
de046644
DC
2262unsigned int
2263xfs_alloc_min_freelist(
2264 struct xfs_mount *mp,
2265 struct xfs_perag *pag)
2266{
b75bb1bd
DW
2267 /* AG btrees have at least 1 level. */
2268 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2269 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
de046644
DC
2270 unsigned int min_free;
2271
4aa2259d 2272 ASSERT(mp->m_alloc_maxlevels > 0);
b75bb1bd 2273
8fdde9dd
OS
2274 /*
2275 * For a btree shorter than the maximum height, the worst case is that
2276 * every level gets split and a new level is added, then while inserting
2277 * another entry to refill the AGFL, every level under the old root gets
2278 * split again. This is:
2279 *
2280 * (full height split reservation) + (AGFL refill split height)
2281 * = (current height + 1) + (current height - 1)
2282 * = (new height) + (new height - 2)
2283 * = 2 * new height - 2
2284 *
2285 * For a btree of maximum height, the worst case is that every level
2286 * under the root gets split, then while inserting another entry to
2287 * refill the AGFL, every level under the root gets split again. This is
2288 * also:
2289 *
2290 * 2 * (current height - 1)
2291 * = 2 * (new height - 1)
2292 * = 2 * new height - 2
2293 */
2294
de046644 2295 /* space needed by-bno freespace btree */
b75bb1bd 2296 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
8fdde9dd 2297 mp->m_alloc_maxlevels) * 2 - 2;
de046644 2298 /* space needed by-size freespace btree */
b75bb1bd 2299 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
8fdde9dd 2300 mp->m_alloc_maxlevels) * 2 - 2;
b8a8d6e5 2301 /* space needed reverse mapping used space btree */
94541a16 2302 if (xfs_has_rmapbt(mp))
b75bb1bd 2303 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
8fdde9dd 2304 mp->m_rmap_maxlevels) * 2 - 2;
de046644
DC
2305
2306 return min_free;
2307}
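/*
 * Illustrative sketch: the per-btree term summed above. A tree of
 * 'level' levels may grow one level during the allocation, then have
 * every level under the (new) root split again while refilling the
 * AGFL, hence min(level + 1, maxlevels) * 2 - 2 blocks worst case.
 */
static unsigned int
min_freelist_term(unsigned int level, unsigned int maxlevels)
{
	unsigned int	new_height = level + 1;

	if (new_height > maxlevels)	/* already at maximum height */
		new_height = maxlevels;
	return new_height * 2 - 2;
}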
2308
5515b7c1
DC
2309/*
2310 * Check if the operation we are fixing up the freelist for should go ahead or
2311 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2312 * is dependent on whether the size and shape of free space available will
2313 * permit the requested allocation to take place.
2314 */
2315static bool
2316xfs_alloc_space_available(
2317 struct xfs_alloc_arg *args,
2318 xfs_extlen_t min_free,
2319 int flags)
2320{
2321 struct xfs_perag *pag = args->pag;
3fe4a6dd 2322 xfs_extlen_t alloc_len, longest;
cf8ce220 2323 xfs_extlen_t reservation; /* blocks that are still reserved */
5515b7c1 2324 int available;
580a4380 2325 xfs_extlen_t agflcount;
5515b7c1
DC
2326
2327 if (flags & XFS_ALLOC_FLAG_FREEING)
2328 return true;
2329
cf8ce220
DW
2330 reservation = xfs_ag_resv_needed(pag, args->resv);
2331
5515b7c1 2332 /* do we have enough contiguous free space for the allocation? */
3fe4a6dd 2333 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
1421de38 2334 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
3fe4a6dd 2335 if (longest < alloc_len)
5515b7c1
DC
2336 return false;
2337
580a4380
BF
2338 /*
2339 * Do we have enough free space remaining for the allocation? Don't
2340 * account extra agfl blocks because we are about to defer freeing them,
2341 * making them unavailable until the current transaction commits.
2342 */
2343 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2344 available = (int)(pag->pagf_freeblks + agflcount -
2c003dc2 2345 reservation - min_free - args->minleft);
3fe4a6dd 2346 if (available < (int)max(args->total, alloc_len))
5515b7c1
DC
2347 return false;
2348
2c003dc2
CH
2349 /*
2350 * Clamp maxlen to the amount of free space available for the actual
2351 * extent allocation.
2352 */
2353 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2354 args->maxlen = available;
2355 ASSERT(args->maxlen > 0);
2356 ASSERT(args->maxlen >= args->minlen);
2357 }
2358
5515b7c1
DC
2359 return true;
2360}
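/*
 * Illustrative sketch, not part of xfs_alloc.c: the two headroom tests
 * above in plain integers. 'longest' is the reservation-adjusted value
 * from the previous function; the maxlen clamping step is omitted.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
space_available(
	uint32_t longest, uint32_t freeblks, uint32_t agflcount,
	uint32_t min_free, uint32_t reservation, uint32_t minleft,
	uint32_t alloc_len, uint32_t total)
{
	int64_t	available;

	if (longest < alloc_len)	/* no extent is big enough */
		return false;
	available = (int64_t)freeblks + agflcount -
			reservation - min_free - minleft;
	return available >= (int64_t)(total > alloc_len ? total : alloc_len);
}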
2361
30c8be8a
BF
2362int
2363xfs_free_agfl_block(
2364 struct xfs_trans *tp,
2365 xfs_agnumber_t agno,
2366 xfs_agblock_t agbno,
2367 struct xfs_buf *agbp,
2368 struct xfs_owner_info *oinfo)
2369{
2370 int error;
2371 struct xfs_buf *bp;
2372
2373 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2374 XFS_AG_RESV_AGFL);
2375 if (error)
2376 return error;
2377
78fcd346
DW
2378 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2379 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2380 tp->t_mountp->m_bsize, 0, &bp);
2381 if (error)
2382 return error;
30c8be8a
BF
2383 xfs_trans_binval(tp, bp);
2384
2385 return 0;
2386}
2387
8dbee8f5 2388/*
daa2d820
DC
2389 * Check the agfl fields of the agf for inconsistency or corruption.
2390 *
2391 * The original purpose was to detect an agfl header padding mismatch between
2392 * current and early v5 kernels. This problem manifests as a 1-slot size
2393 * difference between the on-disk flcount and the active [first, last] range of
2394 * a wrapped agfl.
2395 *
2396 * However, we need to use these same checks to catch agfl count corruptions
2397 * unrelated to padding. This could occur on any v4 or v5 filesystem, so either
2398 * way, we need to reset the agfl and warn the user.
8dbee8f5
BF
2399 *
2400 * Return true if a reset is required before the agfl can be used, false
2401 * otherwise.
2402 */
2403static bool
2404xfs_agfl_needs_reset(
2405 struct xfs_mount *mp,
2406 struct xfs_agf *agf)
2407{
2408 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2409 uint32_t l = be32_to_cpu(agf->agf_fllast);
2410 uint32_t c = be32_to_cpu(agf->agf_flcount);
2411 int agfl_size = xfs_agfl_size(mp);
2412 int active;
2413
8dbee8f5
BF
2414 /*
2415 * The agf read verifier catches severe corruption of these fields.
2416 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2417 * the verifier allows it.
2418 */
2419 if (f >= agfl_size || l >= agfl_size)
2420 return true;
2421 if (c > agfl_size)
2422 return true;
2423
2424 /*
2425 * Check consistency between the on-disk count and the active range. An
2426 * agfl padding mismatch manifests as an inconsistent flcount.
2427 */
2428 if (c && l >= f)
2429 active = l - f + 1;
2430 else if (c)
2431 active = agfl_size - f + l + 1;
2432 else
2433 active = 0;
2434
2435 return active != c;
2436}
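/*
 * Illustrative sketch: counting the active slots of the circular AGFL
 * from its first/last indexes, exactly as done above. A reset is needed
 * whenever this count disagrees with the on-disk flcount.
 */
#include <stdint.h>

static uint32_t
agfl_active_slots(uint32_t first, uint32_t last, uint32_t agfl_size,
		  uint32_t flcount)
{
	if (flcount == 0)
		return 0;			/* list is empty */
	if (last >= first)
		return last - first + 1;	/* contiguous range */
	return agfl_size - first + last + 1;	/* range wraps around */
}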
2437
2438/*
2439 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2440 * agfl content cannot be trusted. Warn the user that a repair is required to
2441 * recover leaked blocks.
2442 *
2443 * The purpose of this mechanism is to handle filesystems affected by the agfl
2444 * header padding mismatch problem. A reset keeps the filesystem online with a
2445 * relatively minor free space accounting inconsistency rather than suffer the
2446 * inevitable crash from use of an invalid agfl block.
2447 */
2448static void
2449xfs_agfl_reset(
2450 struct xfs_trans *tp,
2451 struct xfs_buf *agbp,
2452 struct xfs_perag *pag)
2453{
2454 struct xfs_mount *mp = tp->t_mountp;
0bc284c2 2455 struct xfs_agf *agf = agbp->b_addr;
8dbee8f5 2456
03dc2ef2 2457 ASSERT(xfs_perag_agfl_needs_reset(pag));
8dbee8f5
BF
2458 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2459
2460 xfs_warn(mp,
2461 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2462 "Please unmount and run xfs_repair.",
2463 pag->pag_agno, pag->pagf_flcount);
2464
2465 agf->agf_flfirst = 0;
2466 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2467 agf->agf_flcount = 0;
2468 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2469 XFS_AGF_FLCOUNT);
2470
2471 pag->pagf_flcount = 0;
03dc2ef2 2472 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
8dbee8f5
BF
2473}
2474
d5c1b462
BF
2475/*
2476 * Defer an AGFL block free. This is effectively equivalent to
7d84b02d 2477 * xfs_free_extent_later() with some special handling particular to AGFL blocks.
d5c1b462
BF
2478 *
2479 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2480 * allocation operations in a transaction. AGFL frees are prone to this problem
2481 * because, for one, they are always freed one at a time. Further, an immediate
2482 * AGFL block free can cause a btree join and require another block free before
2483 * the real allocation can proceed. Deferring the free disconnects freeing up
2484 * the AGFL slot from freeing the block.
2485 */
cd3e5d3c 2486static int
d5c1b462 2487xfs_defer_agfl_block(
21375e5d 2488 struct xfs_trans *tp,
d5c1b462 2489 xfs_agnumber_t agno,
d096b26c 2490 xfs_agblock_t agbno,
d5c1b462
BF
2491 struct xfs_owner_info *oinfo)
2492{
21375e5d 2493 struct xfs_mount *mp = tp->t_mountp;
feee990c 2494 struct xfs_extent_free_item *xefi;
d096b26c 2495 xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno);
d5c1b462 2496
7d84b02d 2497 ASSERT(xfs_extfree_item_cache != NULL);
d5c1b462
BF
2498 ASSERT(oinfo != NULL);
2499
d096b26c
DC
2500 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno)))
2501 return -EFSCORRUPTED;
2502
feee990c 2503 xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
d7faa18a 2504 GFP_KERNEL | __GFP_NOFAIL);
d096b26c 2505 xefi->xefi_startblock = fsbno;
feee990c
DW
2506 xefi->xefi_blockcount = 1;
2507 xefi->xefi_owner = oinfo->oi_owner;
ef16737e 2508 xefi->xefi_agresv = XFS_AG_RESV_AGFL;
d5c1b462
BF
2509
2510 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2511
7fef0c11 2512 xfs_extent_free_get_group(mp, xefi);
feee990c 2513 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
cd3e5d3c 2514 return 0;
d5c1b462
BF
2515}
2516
7d84b02d
DW
2517/*
2518 * Add the extent to the list of extents to be freed at transaction end.
2519 * The list is maintained sorted (by block number).
2520 */
cd3e5d3c 2521int
7d84b02d
DW
2522__xfs_free_extent_later(
2523 struct xfs_trans *tp,
2524 xfs_fsblock_t bno,
2525 xfs_filblks_t len,
2526 const struct xfs_owner_info *oinfo,
ef16737e 2527 enum xfs_ag_resv_type type,
7d84b02d
DW
2528 bool skip_discard)
2529{
feee990c 2530 struct xfs_extent_free_item *xefi;
7d84b02d 2531 struct xfs_mount *mp = tp->t_mountp;
7fef0c11 2532#ifdef DEBUG
7d84b02d
DW
2533 xfs_agnumber_t agno;
2534 xfs_agblock_t agbno;
2535
2536 ASSERT(bno != NULLFSBLOCK);
2537 ASSERT(len > 0);
d3e0c71f 2538 ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
7d84b02d
DW
2539 ASSERT(!isnullstartblock(bno));
2540 agno = XFS_FSB_TO_AGNO(mp, bno);
2541 agbno = XFS_FSB_TO_AGBNO(mp, bno);
2542 ASSERT(agno < mp->m_sb.sb_agcount);
2543 ASSERT(agbno < mp->m_sb.sb_agblocks);
2544 ASSERT(len < mp->m_sb.sb_agblocks);
2545 ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2546#endif
2547 ASSERT(xfs_extfree_item_cache != NULL);
ef16737e 2548 ASSERT(type != XFS_AG_RESV_AGFL);
7d84b02d 2549
cd3e5d3c
DC
2550 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len)))
2551 return -EFSCORRUPTED;
2552
feee990c 2553 xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
7d84b02d 2554 GFP_KERNEL | __GFP_NOFAIL);
feee990c
DW
2555 xefi->xefi_startblock = bno;
2556 xefi->xefi_blockcount = (xfs_extlen_t)len;
ef16737e 2557 xefi->xefi_agresv = type;
6d72c6ea 2558 if (skip_discard)
feee990c 2559 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
6d72c6ea
DW
2560 if (oinfo) {
2561 ASSERT(oinfo->oi_offset == 0);
2562
2563 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
feee990c 2564 xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
6d72c6ea 2565 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
feee990c
DW
2566 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2567 xefi->xefi_owner = oinfo->oi_owner;
6d72c6ea 2568 } else {
feee990c 2569 xefi->xefi_owner = XFS_RMAP_OWN_NULL;
6d72c6ea 2570 }
7fef0c11 2571 trace_xfs_bmap_free_defer(mp,
7d84b02d
DW
2572 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2573 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
7fef0c11
DW
2574
2575 xfs_extent_free_get_group(mp, xefi);
feee990c 2576 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
cd3e5d3c 2577 return 0;
7d84b02d
DW
2578}
2579
3006cea4
CB
2580#ifdef DEBUG
2581/*
2582 * Check if an AGF has a free extent record whose length is equal to
2583 * args->minlen.
2584 */
2585STATIC int
2586xfs_exact_minlen_extent_available(
2587 struct xfs_alloc_arg *args,
2588 struct xfs_buf *agbp,
2589 int *stat)
2590{
2591 struct xfs_btree_cur *cnt_cur;
2592 xfs_agblock_t fbno;
2593 xfs_extlen_t flen;
2594 int error = 0;
2595
2596 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
ecb44e84 2597 args->pag, XFS_BTNUM_CNT);
3006cea4
CB
2598 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2599 if (error)
2600 goto out;
2601
2602 if (*stat == 0) {
2603 error = -EFSCORRUPTED;
2604 goto out;
2605 }
2606
2607 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2608 if (error)
2609 goto out;
2610
2611 if (*stat == 1 && flen != args->minlen)
2612 *stat = 0;
2613
2614out:
2615 xfs_btree_del_cursor(cnt_cur, error);
2616
2617 return error;
2618}
2619#endif
2620
2bd0ea18
NS
2621/*
2622 * Decide whether to use this allocation group for this allocation.
2623 * If so, fix up the btree freelist's size.
2bd0ea18 2624 */
ff105f75 2625int /* error */
2bd0ea18 2626xfs_alloc_fix_freelist(
c98e644e 2627 struct xfs_alloc_arg *args, /* allocation argument structure */
01f05365 2628 uint32_t alloc_flags)
2bd0ea18 2629{
c98e644e
DC
2630 struct xfs_mount *mp = args->mp;
2631 struct xfs_perag *pag = args->pag;
2632 struct xfs_trans *tp = args->tp;
2633 struct xfs_buf *agbp = NULL;
2634 struct xfs_buf *agflbp = NULL;
2635 struct xfs_alloc_arg targs; /* local allocation arguments */
2636 xfs_agblock_t bno; /* freelist block */
2637 xfs_extlen_t need; /* total blocks needed in freelist */
fcdd428c 2638 int error = 0;
c98e644e 2639
d85d3424
BF
2640 /* deferred ops (AGFL block frees) require permanent transactions */
2641 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2642
03dc2ef2 2643 if (!xfs_perag_initialised_agf(pag)) {
01f05365 2644 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
475f184c
DW
2645 if (error) {
2646 /* Couldn't lock the AGF so skip this AG. */
2647 if (error == -EAGAIN)
2648 error = 0;
c98e644e 2649 goto out_no_agbp;
2bd0ea18 2650 }
c98e644e 2651 }
34317449 2652
5e656dbb 2653 /*
c98e644e
DC
2654 * If this is a metadata-preferred pag and we are allocating user data,
2655 * then try somewhere else, unless we are being asked to try harder at
2656 * this point.
34317449 2657 */
03dc2ef2
DC
2658 if (xfs_perag_prefers_metadata(pag) &&
2659 (args->datatype & XFS_ALLOC_USERDATA) &&
01f05365
DC
2660 (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2661 ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING));
c98e644e 2662 goto out_agbp_relse;
34317449
NS
2663 }
2664
de046644 2665 need = xfs_alloc_min_freelist(mp, pag);
01f05365 2666 if (!xfs_alloc_space_available(args, need, alloc_flags |
2c003dc2 2667 XFS_ALLOC_FLAG_CHECK))
c98e644e 2668 goto out_agbp_relse;
5e656dbb 2669
2bd0ea18
NS
2670 /*
2671 * Get the a.g. freespace buffer.
2672 * Can fail if we're not blocking on locks, and it's held.
2673 */
c98e644e 2674 if (!agbp) {
01f05365 2675 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp);
475f184c
DW
2676 if (error) {
2677 /* Couldn't lock the AGF so skip this AG. */
2678 if (error == -EAGAIN)
2679 error = 0;
c98e644e 2680 goto out_no_agbp;
2bd0ea18
NS
2681 }
2682 }
72bda06d 2683
8dbee8f5 2684 /* reset a padding mismatched agfl before final free space check */
03dc2ef2 2685 if (xfs_perag_agfl_needs_reset(pag))
8dbee8f5
BF
2686 xfs_agfl_reset(tp, agbp, pag);
2687
72bda06d 2688 /* If there isn't enough total space or single-extent, reject it. */
de046644 2689 need = xfs_alloc_min_freelist(mp, pag);
01f05365 2690 if (!xfs_alloc_space_available(args, need, alloc_flags))
c98e644e 2691 goto out_agbp_relse;
5515b7c1 2692
3006cea4
CB
2693#ifdef DEBUG
2694 if (args->alloc_minlen_only) {
2695 int stat;
2696
2697 error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2698 if (error || !stat)
2699 goto out_agbp_relse;
2700 }
2701#endif
2bd0ea18
NS
2702 /*
2703 * Make the freelist shorter if it's too long.
72bda06d 2704 *
c98e644e
DC
2705 * Note that from this point onwards, we will always release the agf and
2706 * agfl buffers on error. This handles the case where we error out and
2707 * the buffers are clean or may not have been joined to the transaction
2708 * and hence need to be released manually. If they have been joined to
2709 * the transaction, then xfs_trans_brelse() will handle them
2710 * appropriately based on the recursion count and dirty state of the
2711 * buffer.
2712 *
72bda06d
DC
2713 * XXX (dgc): When we have lots of free space, does this buy us
2714 * anything other than extra overhead when we need to put more blocks
2715 * back on the free list? Maybe we should only do this when space is
2716 * getting low or the AGFL is more than half full?
e365af6f
DW
2717 *
2718 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2719 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2720 * updating the rmapbt. Both flags are used in xfs_repair while we're
2721 * rebuilding the rmapbt, and neither are used by the kernel. They're
2722 * both required to ensure that rmaps are correctly recorded for the
2723 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2724 * repair/rmap.c in xfsprogs for details.
2bd0ea18 2725 */
e365af6f 2726 memset(&targs, 0, sizeof(targs));
007347e3 2727 /* struct copy below */
01f05365 2728 if (alloc_flags & XFS_ALLOC_FLAG_NORMAP)
007347e3 2729 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
e365af6f 2730 else
007347e3 2731 targs.oinfo = XFS_RMAP_OINFO_AG;
01f05365
DC
2732 while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) &&
2733 pag->pagf_flcount > need) {
1d202c10 2734 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
5e656dbb 2735 if (error)
c98e644e 2736 goto out_agbp_relse;
30c8be8a 2737
6dc24128 2738 /* defer agfl frees */
cd3e5d3c
DC
2739 error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2740 if (error)
2741 goto out_agbp_relse;
2bd0ea18 2742 }
72bda06d 2743
2bd0ea18
NS
2744 targs.tp = tp;
2745 targs.mp = mp;
2746 targs.agbp = agbp;
2747 targs.agno = args->agno;
cf8ce220 2748 targs.alignment = targs.minlen = targs.prod = 1;
2bd0ea18 2749 targs.pag = pag;
75c01ccc 2750 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
72bda06d 2751 if (error)
c98e644e 2752 goto out_agbp_relse;
72bda06d
DC
2753
2754 /* Make the freelist longer if it's too short. */
2755 while (pag->pagf_flcount < need) {
2bd0ea18 2756 targs.agbno = 0;
72bda06d 2757 targs.maxlen = need - pag->pagf_flcount;
9760cac2 2758 targs.resv = XFS_AG_RESV_AGFL;
72bda06d
DC
2759
2760 /* Allocate as many blocks as possible at once. */
01f05365 2761 error = xfs_alloc_ag_vextent_size(&targs, alloc_flags);
c98e644e
DC
2762 if (error)
2763 goto out_agflbp_relse;
2764
2bd0ea18 2765 /*
dfc130f3
RC
2766 * Stop if we run out. Won't happen if callers are obeying
2767 * the restrictions correctly. Can happen for free calls
2bd0ea18
NS
2768 * on a completely full ag.
2769 */
5e656dbb 2770 if (targs.agbno == NULLAGBLOCK) {
01f05365 2771 if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
5e656dbb 2772 break;
c98e644e 2773 goto out_agflbp_relse;
5e656dbb 2774 }
bad76a2c
DC
2775
2776 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) {
2777 error = xfs_rmap_alloc(tp, agbp, pag,
2778 targs.agbno, targs.len, &targs.oinfo);
2779 if (error)
2780 goto out_agflbp_relse;
2781 }
2782 error = xfs_alloc_update_counters(tp, agbp,
2783 -((long)(targs.len)));
2784 if (error)
2785 goto out_agflbp_relse;
2786
2bd0ea18
NS
2787 /*
2788 * Put each allocated block on the list.
2789 */
2790 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
9a73333d 2791 error = xfs_alloc_put_freelist(pag, tp, agbp,
5e656dbb
BN
2792 agflbp, bno, 0);
2793 if (error)
c98e644e 2794 goto out_agflbp_relse;
2bd0ea18
NS
2795 }
2796 }
cb4deb22 2797 xfs_trans_brelse(tp, agflbp);
2bd0ea18
NS
2798 args->agbp = agbp;
2799 return 0;
c98e644e
DC
2800
2801out_agflbp_relse:
2802 xfs_trans_brelse(tp, agflbp);
2803out_agbp_relse:
2804 if (agbp)
2805 xfs_trans_brelse(tp, agbp);
2806out_no_agbp:
2807 args->agbp = NULL;
2808 return error;
2bd0ea18
NS
2809}
2810
2811/*
2812 * Get a block from the freelist.
2813 * Returns with the buffer for the block gotten.
2814 */
d2ff101f 2815int
2bd0ea18 2816xfs_alloc_get_freelist(
1d202c10 2817 struct xfs_perag *pag,
d2ff101f
DC
2818 struct xfs_trans *tp,
2819 struct xfs_buf *agbp,
2820 xfs_agblock_t *bnop,
2821 int btreeblk)
2bd0ea18 2822{
d2ff101f
DC
2823 struct xfs_agf *agf = agbp->b_addr;
2824 struct xfs_buf *agflbp;
2825 xfs_agblock_t bno;
2826 __be32 *agfl_bno;
2827 int error;
a562dd2f 2828 uint32_t logflags;
d2ff101f 2829 struct xfs_mount *mp = tp->t_mountp;
2bd0ea18 2830
2bd0ea18
NS
2831 /*
2832 * Freelist is empty, give up.
2833 */
46eca962 2834 if (!agf->agf_flcount) {
2bd0ea18
NS
2835 *bnop = NULLAGBLOCK;
2836 return 0;
2837 }
2838 /*
2839 * Read the array of free blocks.
2840 */
75c01ccc 2841 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
dd5b876e 2842 if (error)
2bd0ea18 2843 return error;
dd5b876e
DC
2844
2845
2bd0ea18
NS
2846 /*
2847 * Get the block number and update the data structures.
2848 */
b134e771 2849 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
dd5b876e 2850 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
629d6b3d
DC
2851 if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno)))
2852 return -EFSCORRUPTED;
2853
5e656dbb 2854 be32_add_cpu(&agf->agf_flfirst, 1);
2bd0ea18 2855 xfs_trans_brelse(tp, agflbp);
b8165508 2856 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
46eca962 2857 agf->agf_flfirst = 0;
56b2de80 2858
03dc2ef2 2859 ASSERT(!xfs_perag_agfl_needs_reset(pag));
5e656dbb 2860 be32_add_cpu(&agf->agf_flcount, -1);
2bd0ea18 2861 pag->pagf_flcount--;
cdded3d8
DC
2862
2863 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2864 if (btreeblk) {
5e656dbb 2865 be32_add_cpu(&agf->agf_btreeblks, 1);
cdded3d8
DC
2866 pag->pagf_btreeblks++;
2867 logflags |= XFS_AGF_BTREEBLKS;
2868 }
2869
cdded3d8 2870 xfs_alloc_log_agf(tp, agbp, logflags);
2bd0ea18 2871 *bnop = bno;
3e535bba 2872
2bd0ea18
NS
2873 return 0;
2874}
2875
2876/*
2877 * Log the given fields from the agf structure.
2878 */
2879void
2880xfs_alloc_log_agf(
a562dd2f
DC
2881 struct xfs_trans *tp,
2882 struct xfs_buf *bp,
2883 uint32_t fields)
2bd0ea18
NS
2884{
2885 int first; /* first byte offset */
2886 int last; /* last byte offset */
2887 static const short offsets[] = {
2888 offsetof(xfs_agf_t, agf_magicnum),
2889 offsetof(xfs_agf_t, agf_versionnum),
2890 offsetof(xfs_agf_t, agf_seqno),
2891 offsetof(xfs_agf_t, agf_length),
2892 offsetof(xfs_agf_t, agf_roots[0]),
2893 offsetof(xfs_agf_t, agf_levels[0]),
2894 offsetof(xfs_agf_t, agf_flfirst),
2895 offsetof(xfs_agf_t, agf_fllast),
2896 offsetof(xfs_agf_t, agf_flcount),
2897 offsetof(xfs_agf_t, agf_freeblks),
2898 offsetof(xfs_agf_t, agf_longest),
cdded3d8 2899 offsetof(xfs_agf_t, agf_btreeblks),
dd5b876e 2900 offsetof(xfs_agf_t, agf_uuid),
8511b71a 2901 offsetof(xfs_agf_t, agf_rmap_blocks),
bc859611
DW
2902 offsetof(xfs_agf_t, agf_refcount_blocks),
2903 offsetof(xfs_agf_t, agf_refcount_root),
2904 offsetof(xfs_agf_t, agf_refcount_level),
8511b71a
DW
2905 /* needed so that we don't log the whole rest of the structure: */
2906 offsetof(xfs_agf_t, agf_spare64),
2bd0ea18
NS
2907 sizeof(xfs_agf_t)
2908 };
2909
0bc284c2 2910 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
56b2de80 2911
bdc16ee5 2912 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
dd5b876e 2913
2bd0ea18
NS
2914 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2915 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2916}
2917
2bd0ea18
NS
2918/*
2919 * Put the block on the freelist for the allocation group.
2920 */
d2ff101f 2921int
2bd0ea18 2922xfs_alloc_put_freelist(
9a73333d 2923 struct xfs_perag *pag,
d2ff101f
DC
2924 struct xfs_trans *tp,
2925 struct xfs_buf *agbp,
2926 struct xfs_buf *agflbp,
2927 xfs_agblock_t bno,
2928 int btreeblk)
2bd0ea18 2929{
0bc284c2
CH
2930 struct xfs_mount *mp = tp->t_mountp;
2931 struct xfs_agf *agf = agbp->b_addr;
d2ff101f 2932 __be32 *blockp;
2bd0ea18 2933 int error;
a562dd2f 2934 uint32_t logflags;
dd5b876e
DC
2935 __be32 *agfl_bno;
2936 int startoff;
2bd0ea18 2937
75c01ccc
DC
2938 if (!agflbp) {
2939 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2940 if (error)
2941 return error;
2942 }
2943
5e656dbb 2944 be32_add_cpu(&agf->agf_fllast, 1);
b8165508 2945 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
46eca962 2946 agf->agf_fllast = 0;
56b2de80 2947
03dc2ef2 2948 ASSERT(!xfs_perag_agfl_needs_reset(pag));
5e656dbb 2949 be32_add_cpu(&agf->agf_flcount, 1);
2bd0ea18 2950 pag->pagf_flcount++;
cdded3d8
DC
2951
2952 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2953 if (btreeblk) {
5e656dbb 2954 be32_add_cpu(&agf->agf_btreeblks, -1);
cdded3d8
DC
2955 pag->pagf_btreeblks--;
2956 logflags |= XFS_AGF_BTREEBLKS;
2957 }
2958
5e656dbb
BN
2959 xfs_alloc_log_agf(tp, agbp, logflags);
2960
b8165508 2961 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
dd5b876e 2962
b134e771 2963 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
dd5b876e 2964 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
5e656dbb 2965 *blockp = cpu_to_be32(bno);
dd5b876e
DC
2966 startoff = (char *)blockp - (char *)agflbp->b_addr;
2967
cdded3d8 2968 xfs_alloc_log_agf(tp, agbp, logflags);
dd5b876e 2969
bdc16ee5 2970 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
dd5b876e
DC
2971 xfs_trans_log_buf(tp, agflbp, startoff,
2972 startoff + sizeof(xfs_agblock_t) - 1);
2bd0ea18
NS
2973 return 0;
2974}
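/*
 * Illustrative sketch, not part of xfs_alloc.c: the AGFL as a circular
 * buffer. Gets consume from agf_flfirst, puts append at agf_fllast, and
 * both indexes wrap at the AGFL size, mirroring the index updates in
 * xfs_alloc_get_freelist() and xfs_alloc_put_freelist() above. The
 * struct is a userspace stand-in, not the on-disk format.
 */
#include <stdint.h>

struct agfl_model {
	uint32_t	flfirst;	/* index of oldest entry */
	uint32_t	fllast;		/* index of newest entry */
	uint32_t	flcount;	/* number of active entries */
	uint32_t	size;		/* slots, cf. xfs_agfl_size() */
};

static void
agfl_model_get(struct agfl_model *a)
{
	if (++a->flfirst == a->size)	/* wrap the head index */
		a->flfirst = 0;
	a->flcount--;
}

static void
agfl_model_put(struct agfl_model *a)
{
	if (++a->fllast == a->size)	/* wrap the tail index */
		a->fllast = 0;
	a->flcount++;
}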
2975
98572f41
DW
2976/*
2977 * Check that this AGF/AGI header's sequence number and length matches the AG
2978 * number and size in fsblocks.
2979 */
2980xfs_failaddr_t
2981xfs_validate_ag_length(
2982 struct xfs_buf *bp,
2983 uint32_t seqno,
2984 uint32_t length)
2985{
2986 struct xfs_mount *mp = bp->b_mount;
2987 /*
2988 * During growfs operations, the perag is not fully initialised,
2989 * so we can't use it for any useful checking. growfs ensures we can't
2990 * use it by using uncached buffers that don't have the perag attached
2991 * so we can detect and avoid this problem.
2992 */
2993 if (bp->b_pag && seqno != bp->b_pag->pag_agno)
2994 return __this_address;
2995
2996 /*
2997 * Only the last AG in the filesystem is allowed to be shorter
2998 * than the AG size recorded in the superblock.
2999 */
3000 if (length != mp->m_sb.sb_agblocks) {
3001 /*
3002 * During growfs, the new last AG can get here before we
3003 * have updated the superblock. Give it a pass on the seqno
3004 * check.
3005 */
3006 if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1)
3007 return __this_address;
3008 if (length < XFS_MIN_AG_BLOCKS)
3009 return __this_address;
3010 if (length > mp->m_sb.sb_agblocks)
3011 return __this_address;
3012 }
3013
3014 return NULL;
3015}
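/*
 * Illustrative sketch: the AG length rule enforced above as a pure
 * predicate. Only the last AG may be shorter than sb_agblocks, and even
 * then it must hold at least the filesystem minimum ('min_blocks', a
 * stand-in for XFS_MIN_AG_BLOCKS). The growfs seqno exception is
 * ignored here.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
ag_length_valid(uint32_t seqno, uint32_t length, uint32_t agcount,
		uint32_t agblocks, uint32_t min_blocks)
{
	if (length == agblocks)
		return true;		/* full-size AG, always fine */
	return seqno == agcount - 1 &&	/* only the last AG may be short */
	       length >= min_blocks &&
	       length <= agblocks;
}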
3016
daa2d820
DC
3017/*
3018 * Verify the AGF is consistent.
3019 *
3020 * We do not verify the AGFL indexes in the AGF are fully consistent here
3021 * because of issues with variable on-disk structure sizes. Instead, we check
3022 * the agfl indexes for consistency when we initialise the perag from the AGF
3023 * information after a read completes.
3024 *
3025 * If the index is inconsistent, then we mark the perag as needing an AGFL
3026 * reset. The first AGFL update performed then resets the AGFL indexes and
3027 * refills the AGFL with known good free blocks, allowing the filesystem to
3028 * continue operating normally at the cost of a few leaked free space blocks.
3029 */
bc01119d 3030static xfs_failaddr_t
a2ceac1f 3031xfs_agf_verify(
95d9582b
DW
3032 struct xfs_buf *bp)
3033{
7861ef77 3034 struct xfs_mount *mp = bp->b_mount;
0bc284c2 3035 struct xfs_agf *agf = bp->b_addr;
98572f41
DW
3036 xfs_failaddr_t fa;
3037 uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno);
6ac452dc 3038 uint32_t agf_length = be32_to_cpu(agf->agf_length);
a2ceac1f 3039
b16a427a 3040 if (xfs_has_crc(mp)) {
a65d8d29 3041 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
bc01119d 3042 return __this_address;
0bc284c2 3043 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
bc01119d 3044 return __this_address;
a65d8d29 3045 }
a2ceac1f 3046
68dbe77f
BF
3047 if (!xfs_verify_magic(bp, agf->agf_magicnum))
3048 return __this_address;
3049
6ac452dc 3050 if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)))
bc01119d 3051 return __this_address;
a2ceac1f 3052
6ac452dc
DC
3053 /*
3054 * Both agf_seqno and agf_length need to be validated before anything else
3055 * related to block numbers in the AGF or AGFL can be checked.
6ac452dc 3056 */
98572f41
DW
3057 fa = xfs_validate_ag_length(bp, agf_seqno, agf_length);
3058 if (fa)
3059 return fa;
6ac452dc
DC
3060
3061 if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp))
3062 return __this_address;
3063 if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp))
3064 return __this_address;
3065 if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp))
0f24228d
ZB
3066 return __this_address;
3067
3068 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
6ac452dc 3069 be32_to_cpu(agf->agf_freeblks) > agf_length)
0f24228d
ZB
3070 return __this_address;
3071
00795aae
DW
3072 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
3073 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
4aa2259d
DW
3074 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
3075 mp->m_alloc_maxlevels ||
3076 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
3077 mp->m_alloc_maxlevels)
bc01119d 3078 return __this_address;
5a35bf2c 3079
6ac452dc
DC
3080 if (xfs_has_lazysbcount(mp) &&
3081 be32_to_cpu(agf->agf_btreeblks) > agf_length)
0f24228d
ZB
3082 return __this_address;
3083
6ac452dc
DC
3084 if (xfs_has_rmapbt(mp)) {
3085 if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
3086 return __this_address;
a2ceac1f 3087
6ac452dc
DC
3088 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
3089 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
3090 mp->m_rmap_maxlevels)
3091 return __this_address;
3092 }
dd5b876e 3093
6ac452dc
DC
3094 if (xfs_has_reflink(mp)) {
3095 if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length)
3096 return __this_address;
0f24228d 3097
6ac452dc
DC
3098 if (be32_to_cpu(agf->agf_refcount_level) < 1 ||
3099 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)
3100 return __this_address;
3101 }
88ce0792 3102
bc01119d 3103 return NULL;
a2ceac1f
DC
3104}
3105
3106static void
3107xfs_agf_read_verify(
3108 struct xfs_buf *bp)
3109{
7861ef77 3110 struct xfs_mount *mp = bp->b_mount;
1e697959 3111 xfs_failaddr_t fa;
dd5b876e 3112
b16a427a 3113 if (xfs_has_crc(mp) &&
45922933 3114 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
1e697959
DW
3115 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3116 else {
95d9582b 3117 fa = xfs_agf_verify(bp);
1e697959
DW
3118 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3119 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3120 }
a2ceac1f
DC
3121}
3122
3123static void
3124xfs_agf_write_verify(
3125 struct xfs_buf *bp)
3126{
7861ef77 3127 struct xfs_mount *mp = bp->b_mount;
37d086ca 3128 struct xfs_buf_log_item *bip = bp->b_log_item;
0bc284c2 3129 struct xfs_agf *agf = bp->b_addr;
1e697959 3130 xfs_failaddr_t fa;
dd5b876e 3131
95d9582b 3132 fa = xfs_agf_verify(bp);
1e697959
DW
3133 if (fa) {
3134 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
dd5b876e
DC
3135 return;
3136 }
3137
b16a427a 3138 if (!xfs_has_crc(mp))
dd5b876e
DC
3139 return;
3140
3141 if (bip)
0bc284c2 3142 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
dd5b876e 3143
43b5aeed 3144 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
a2ceac1f
DC
3145}
3146
3147const struct xfs_buf_ops xfs_agf_buf_ops = {
a3fac935 3148 .name = "xfs_agf",
68dbe77f 3149 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
a2ceac1f
DC
3150 .verify_read = xfs_agf_read_verify,
3151 .verify_write = xfs_agf_write_verify,
95d9582b 3152 .verify_struct = xfs_agf_verify,
a2ceac1f
DC
3153};
3154
2bd0ea18
NS
3155/*
3156 * Read in the allocation group header (free/alloc section).
3157 */
c1030eda 3158int
56b2de80 3159xfs_read_agf(
c1030eda
DC
3160 struct xfs_perag *pag,
3161 struct xfs_trans *tp,
3162 int flags,
3163 struct xfs_buf **agfbpp)
2bd0ea18 3164{
c1030eda
DC
3165 struct xfs_mount *mp = pag->pag_mount;
3166 int error;
2bd0ea18 3167
c1030eda 3168 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
ff105f75 3169
3e6069a1 3170 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
c1030eda
DC
3171 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3172 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
9440d84d 3173 if (error)
2bd0ea18 3174 return error;
56b2de80 3175
c1030eda 3176 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
56b2de80
DC
3177 return 0;
3178}
3179
3180/*
87db57ba
DC
3181 * Read in the allocation group header (free/alloc section) and initialise the
3182 * perag structure if necessary. If the caller provides @agfbpp, then return the
3183 * locked buffer to the caller, otherwise free it.
56b2de80 3184 */
f9084bd9 3185int
56b2de80 3186xfs_alloc_read_agf(
f9084bd9
DC
3187 struct xfs_perag *pag,
3188 struct xfs_trans *tp,
3189 int flags,
87db57ba 3190 struct xfs_buf **agfbpp)
56b2de80 3191{
87db57ba 3192 struct xfs_buf *agfbp;
f9084bd9 3193 struct xfs_agf *agf;
56b2de80 3194 int error;
92c76be5 3195 int allocbt_blks;
56b2de80 3196
f9084bd9 3197 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
56b2de80 3198
475f184c
DW
3199 /* We don't support trylock when freeing. */
3200 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3201 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
c1030eda 3202 error = xfs_read_agf(pag, tp,
56b2de80 3203 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
87db57ba 3204 &agfbp);
56b2de80
DC
3205 if (error)
3206 return error;
56b2de80 3207
87db57ba 3208 agf = agfbp->b_addr;
03dc2ef2 3209 if (!xfs_perag_initialised_agf(pag)) {
6e3140c7 3210 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
cdded3d8 3211 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
6e3140c7
NS
3212 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3213 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
2bd0ea18 3214 pag->pagf_levels[XFS_BTNUM_BNOi] =
6e3140c7 3215 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
2bd0ea18 3216 pag->pagf_levels[XFS_BTNUM_CNTi] =
6e3140c7 3217 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
e37838e5
DW
3218 pag->pagf_levels[XFS_BTNUM_RMAPi] =
3219 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
88ce0792 3220 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
03dc2ef2
DC
3221 if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3222 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
b2f12cf3
DW
3223 else
3224 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
92c76be5
BF
3225
3226 /*
3227 * Update the in-core allocbt counter. Filter out the rmapbt
3228 * subset of the btreeblks counter because the rmapbt is managed
3229 * by perag reservation. Subtract one for the rmapbt root block
3230 * because the rmap counter includes it while the btreeblks
3231 * counter only tracks non-root blocks.
3232 */
3233 allocbt_blks = pag->pagf_btreeblks;
f9084bd9 3234 if (xfs_has_rmapbt(pag->pag_mount))
92c76be5
BF
3235 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3236 if (allocbt_blks > 0)
f9084bd9
DC
3237 atomic64_add(allocbt_blks,
3238 &pag->pag_mount->m_allocbt_blks);
03dc2ef2
DC
3239
3240 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
2bd0ea18
NS
3241 }
3242#ifdef DEBUG
f9084bd9 3243 else if (!xfs_is_shutdown(pag->pag_mount)) {
6e3140c7 3244 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
cdded3d8 3245 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
6e3140c7
NS
3246 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3247 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2bd0ea18 3248 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
6e3140c7 3249 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
2bd0ea18 3250 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
6e3140c7 3251 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2bd0ea18
NS
3252 }
3253#endif
87db57ba
DC
3254 if (agfbpp)
3255 *agfbpp = agfbp;
3256 else
3257 xfs_trans_brelse(tp, agfbp);
2bd0ea18
NS
3258 return 0;
3259}
3260
3261/*
0ac70e08
DC
3262 * Pre-process allocation arguments to set initial state that we don't require
3263 * callers to set up correctly, as well as bounds check the allocation args
3264 * that are set up.
2bd0ea18 3265 */
0ac70e08
DC
3266static int
3267xfs_alloc_vextent_check_args(
5ba1f915 3268 struct xfs_alloc_arg *args,
50f6a20b
DC
3269 xfs_fsblock_t target,
3270 xfs_agnumber_t *minimum_agno)
2bd0ea18 3271{
0ac70e08
DC
3272 struct xfs_mount *mp = args->mp;
3273 xfs_agblock_t agsize;
2bd0ea18 3274
32cd26bf 3275 args->fsbno = NULLFSBLOCK;
0ac70e08 3276
50f6a20b
DC
3277 *minimum_agno = 0;
3278 if (args->tp->t_highest_agno != NULLAGNUMBER)
3279 *minimum_agno = args->tp->t_highest_agno;
3280
2bd0ea18
NS
3281 /*
3282 * Just fix this up, for the case where the last a.g. is shorter
3283 * (or there's only one a.g.) and the caller couldn't easily figure
3284 * that out (xfs_bmap_alloc).
3285 */
3286 agsize = mp->m_sb.sb_agblocks;
3287 if (args->maxlen > agsize)
3288 args->maxlen = agsize;
3289 if (args->alignment == 0)
3290 args->alignment = 1;
a74b5502
DC
3291
3292 ASSERT(args->minlen > 0);
3293 ASSERT(args->maxlen > 0);
3294 ASSERT(args->alignment > 0);
3295 ASSERT(args->resv != XFS_AG_RESV_AGFL);
3296
5ba1f915
DC
3297 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount);
3298 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize);
2bd0ea18
NS
3299 ASSERT(args->minlen <= args->maxlen);
3300 ASSERT(args->minlen <= agsize);
3301 ASSERT(args->mod < args->prod);
a74b5502 3302
5ba1f915
DC
3303 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount ||
3304 XFS_FSB_TO_AGBNO(mp, target) >= agsize ||
2bd0ea18
NS
3305 args->minlen > args->maxlen || args->minlen > agsize ||
3306 args->mod >= args->prod) {
56b2de80 3307 trace_xfs_alloc_vextent_badargs(args);
0ac70e08
DC
3308 return -ENOSPC;
3309 }
50f6a20b
DC
3310
3311 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) {
3312 trace_xfs_alloc_vextent_skip_deadlock(args);
3313 return -ENOSPC;
3314 }
0ac70e08 3315 return 0;
50f6a20b 3316
0ac70e08
DC
3317}
3318
a74b5502
DC
3319/*
3320 * Prepare an AG for allocation. If the AG is not prepared to accept the
3321 * allocation, return failure.
3322 *
3323 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are
3324 * modified to hold their own perag references.
3325 */
3326static int
3327xfs_alloc_vextent_prepare_ag(
8b2a40ff 3328 struct xfs_alloc_arg *args,
01f05365 3329 uint32_t alloc_flags)
a74b5502
DC
3330{
3331 bool need_pag = !args->pag;
3332 int error;
3333
3334 if (need_pag)
3335 args->pag = xfs_perag_get(args->mp, args->agno);
3336
91cceb74 3337 args->agbp = NULL;
01f05365 3338 error = xfs_alloc_fix_freelist(args, alloc_flags);
a74b5502
DC
3339 if (error) {
3340 trace_xfs_alloc_vextent_nofix(args);
3341 if (need_pag)
3342 xfs_perag_put(args->pag);
3343 args->agbno = NULLAGBLOCK;
3344 return error;
3345 }
3346 if (!args->agbp) {
3347 /* cannot allocate in this AG at all */
3348 trace_xfs_alloc_vextent_noagbp(args);
3349 args->agbno = NULLAGBLOCK;
3350 return 0;
3351 }
3352 args->wasfromfl = 0;
3353 return 0;
3354}
3355
0ac70e08 3356/*
d99d5287
DC
3357 * Post-process allocation results to account for the allocation if it succeeded
3358 * and set the allocated block number correctly for the caller.
0ac70e08 3359 *
d99d5287 3360 * XXX: we should really be returning ENOSPC for ENOSPC, not
0ac70e08
DC
3361 * hiding it behind a "successful" NULLFSBLOCK allocation.
3362 */
d99d5287
DC
3363static int
3364xfs_alloc_vextent_finish(
0ac70e08 3365 struct xfs_alloc_arg *args,
d99d5287
DC
3366 xfs_agnumber_t minimum_agno,
3367 int alloc_error,
3368 bool drop_perag)
0ac70e08
DC
3369{
3370 struct xfs_mount *mp = args->mp;
d99d5287 3371 int error = 0;
0ac70e08
DC
3372
3373 /*
3374 * We can end up here with a locked AGF. If we failed, the caller is
3375 * likely going to try to allocate again with different parameters, and
3376 * that can widen the AGs that are searched for free space. If we have
3377 * to do BMBT block allocation, we have to do a new allocation.
3378 *
3379 * Hence leaving this function with the AGF locked opens up potential
3380 * ABBA AGF deadlocks because a future allocation attempt in this
3381 * transaction may attempt to lock a lower number AGF.
3382 *
3383 * We can't release the AGF until the transaction is committed, so at
3384 * this point we must update the "first allocation" tracker to point at
3385 * this AG if the tracker is empty or points to a lower AG. This allows
3386 * the next allocation attempt to be modified appropriately to avoid
3387 * deadlocks.
3388 */
3389 if (args->agbp &&
3390 (args->tp->t_highest_agno == NULLAGNUMBER ||
3391 args->agno > minimum_agno))
3392 args->tp->t_highest_agno = args->agno;
3393
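	/*
	 * Illustrative example of the rule above: if this transaction has
	 * already locked the AGF of AG 3 (t_highest_agno == 3), a later
	 * allocation in the same transaction must only consider AGs >= 3.
	 * Locking the AGF of AG 1 after that of AG 3 would invert the
	 * ascending AGF locking order and open an ABBA deadlock against a
	 * transaction walking the AGs in the normal order.
	 */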
d99d5287
DC
3394 /*
3395 * If the allocation failed with an error or we had an ENOSPC result,
3396 * preserve the returned error whilst also marking the allocation result
3397 * as "no extent allocated". This ensures that callers that fail to
3398 * capture the error will still treat it as a failed allocation.
3399 */
3400 if (alloc_error || args->agbno == NULLAGBLOCK) {
0ac70e08 3401 args->fsbno = NULLFSBLOCK;
d99d5287
DC
3402 error = alloc_error;
3403 goto out_drop_perag;
0ac70e08
DC
3404 }
3405
3406 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
d99d5287 3407
0ac70e08
DC
3408 ASSERT(args->len >= args->minlen);
3409 ASSERT(args->len <= args->maxlen);
3410 ASSERT(args->agbno % args->alignment == 0);
3411 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len);
d99d5287
DC
3412
3413 /* if not file data, insert new block into the reverse map btree */
3414 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
3415 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
3416 args->agbno, args->len, &args->oinfo);
3417 if (error)
3418 goto out_drop_perag;
3419 }
3420
3421 if (!args->wasfromfl) {
3422 error = xfs_alloc_update_counters(args->tp, args->agbp,
3423 -((long)(args->len)));
3424 if (error)
3425 goto out_drop_perag;
3426
3427 ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno,
3428 args->len));
3429 }
3430
3431 xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
3432
3433 XFS_STATS_INC(mp, xs_allocx);
3434 XFS_STATS_ADD(mp, xs_allocb, args->len);
3435
6904abce
DW
3436 trace_xfs_alloc_vextent_finish(args);
3437
d99d5287 3438out_drop_perag:
91cceb74
DC
3439 if (drop_perag && args->pag) {
3440 xfs_perag_rele(args->pag);
d99d5287
DC
3441 args->pag = NULL;
3442 }
3443 return error;
0ac70e08
DC
3444}
3445
3446/*
32cd26bf
DC
3447 * Allocate within a single AG only. This uses a best-fit length algorithm, so if
3448 * you need an exact-sized allocation without locality constraints, this is the
3449 * fastest way to do it.
3450 *
3451 * Caller is expected to hold a perag reference in args->pag.
0ac70e08 3452 */
04215b9f 3453int
0ac70e08 3454xfs_alloc_vextent_this_ag(
c4e10f2b
DC
3455 struct xfs_alloc_arg *args,
3456 xfs_agnumber_t agno)
0ac70e08
DC
3457{
3458 struct xfs_mount *mp = args->mp;
50f6a20b 3459 xfs_agnumber_t minimum_agno;
01f05365 3460 uint32_t alloc_flags = 0;
0ac70e08
DC
3461 int error;
3462
fc59ca7e
DW
3463 ASSERT(args->pag != NULL);
3464 ASSERT(args->pag->pag_agno == agno);
3465
50f6a20b
DC
3466 args->agno = agno;
3467 args->agbno = 0;
6904abce
DW
3468
3469 trace_xfs_alloc_vextent_this_ag(args);
3470
50f6a20b
DC
3471 error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0),
3472 &minimum_agno);
0ac70e08
DC
3473 if (error) {
3474 if (error == -ENOSPC)
3475 return 0;
3476 return error;
3477 }
3478
01f05365 3479 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
a74b5502 3480 if (!error && args->agbp)
01f05365 3481 error = xfs_alloc_ag_vextent_size(args, alloc_flags);
a74b5502 3482
d99d5287 3483 return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
0ac70e08
DC
3484}
3485
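/*
 * Illustrative call pattern, assuming the caller knows the AG it must
 * allocate from (e.g. for per-AG metadata):
 *
 *	args.pag = xfs_perag_grab(mp, agno);
 *	if (args.pag) {
 *		error = xfs_alloc_vextent_this_ag(&args, agno);
 *		xfs_perag_rele(args.pag);
 *	}
 *
 * The allocation finishes with drop_perag == false, so the caller pairs
 * the grab with its own release.
 */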
3486/*
3487 * Iterate all AGs trying to allocate an extent starting from @start_agno.
3488 *
3489 * If the caller passes a non-zero @target_agbno, the allocation attempts in
3490 * @start_agno have locality information. If we fail to allocate in that AG,
3491 * then we revert to anywhere-in-AG for all the other AGs we attempt to
3492 * allocate in, as there is no locality optimisation possible for those
3493 * allocations.
3494 *
91cceb74
DC
3495 * On return, args->pag may be left referenced if we finish before the "all
3496 * failed" return point. The allocation finish still needs the perag, and
3497 * so the caller will release it once they've finished the allocation.
3498 *
0ac70e08
DC
3499 * When we wrap the AG iteration at the end of the filesystem, we have to be
3500 * careful not to wrap into AGs below ones we already have locked in the
3501 * transaction if we are doing a blocking iteration. This will result in an
3502 * out-of-order locking of AGFs and hence can cause deadlocks.
3503 */
3504static int
3505xfs_alloc_vextent_iterate_ags(
3506 struct xfs_alloc_arg *args,
3507 xfs_agnumber_t minimum_agno,
3508 xfs_agnumber_t start_agno,
32cd26bf 3509 xfs_agblock_t target_agbno,
01f05365 3510 uint32_t alloc_flags)
0ac70e08
DC
3511{
3512 struct xfs_mount *mp = args->mp;
c2337664 3513 xfs_agnumber_t restart_agno = minimum_agno;
91cceb74 3514 xfs_agnumber_t agno;
0ac70e08 3515 int error = 0;
755477b4 3516
01f05365 3517 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)
c2337664 3518 restart_agno = 0;
91cceb74 3519restart:
c2337664 3520 for_each_perag_wrap_range(mp, start_agno, restart_agno,
91cceb74
DC
3521 mp->m_sb.sb_agcount, agno, args->pag) {
3522 args->agno = agno;
01f05365 3523 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
a74b5502 3524 if (error)
2bd0ea18 3525 break;
91cceb74
DC
3526 if (!args->agbp) {
3527 trace_xfs_alloc_vextent_loopfailed(args);
3528 continue;
a74b5502 3529 }
0ac70e08 3530
0ac70e08 3531 /*
91cceb74
DC
3532 * Allocation is supposed to succeed now, so break out of the
3533 * loop regardless of whether we succeed or not.
0ac70e08 3534 */
91cceb74 3535 if (args->agno == start_agno && target_agbno) {
32cd26bf 3536 args->agbno = target_agbno;
01f05365 3537 error = xfs_alloc_ag_vextent_near(args, alloc_flags);
91cceb74
DC
3538 } else {
3539 args->agbno = 0;
01f05365 3540 error = xfs_alloc_ag_vextent_size(args, alloc_flags);
0ac70e08 3541 }
91cceb74
DC
3542 break;
3543 }
3544 if (error) {
3545 xfs_perag_rele(args->pag);
0ac70e08 3546 args->pag = NULL;
91cceb74 3547 return error;
0ac70e08 3548 }
91cceb74
DC
3549 if (args->agbp)
3550 return 0;
3551
d99d5287 3552 /*
91cceb74
DC
3553 * We didn't find an AG we can allocate from. If we were given
3554 * constraining flags by the caller, drop them and retry the allocation
3555 * without any constraints being set.
d99d5287 3556 */
01f05365
DC
3557 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) {
3558 alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK;
c2337664 3559 restart_agno = minimum_agno;
91cceb74
DC
3560 goto restart;
3561 }
3562
3563 ASSERT(args->pag == NULL);
3564 trace_xfs_alloc_vextent_allfailed(args);
3565 return 0;
0ac70e08 3566}
56b2de80 3567
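/*
 * Illustrative iteration order for the loop above: with sb_agcount == 8,
 * start_agno == 5 and restart_agno == 0, the AGs are visited as
 * 5, 6, 7, 0, 1, 2, 3, 4. On the blocking (non-TRYLOCK) pass,
 * restart_agno is minimum_agno, so the wrap never dips below the AGs
 * this transaction is still allowed to lock.
 */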
0ac70e08
DC
3568/*
3569 * Iterate from the AGs from the start AG to the end of the filesystem, trying
3570 * to allocate blocks. It starts with a near allocation attempt in the initial
3571 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap
3572 * back to zero if allowed by previous allocations in this transaction,
3573 * otherwise will wrap back to the start AG and run a second blocking pass to
3574 * the end of the filesystem.
3575 */
1a5a98d5 3576int
0ac70e08
DC
3577xfs_alloc_vextent_start_ag(
3578 struct xfs_alloc_arg *args,
1a5a98d5 3579 xfs_fsblock_t target)
0ac70e08
DC
3580{
3581 struct xfs_mount *mp = args->mp;
50f6a20b 3582 xfs_agnumber_t minimum_agno;
0ac70e08
DC
3583 xfs_agnumber_t start_agno;
3584 xfs_agnumber_t rotorstep = xfs_rotorstep;
3585 bool bump_rotor = false;
01f05365 3586 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
0ac70e08 3587 int error;
755477b4 3588
fc59ca7e
DW
3589 ASSERT(args->pag == NULL);
3590
50f6a20b
DC
3591 args->agno = NULLAGNUMBER;
3592 args->agbno = NULLAGBLOCK;
6904abce 3593
002b5d96 3594 trace_xfs_alloc_vextent_start_ag(args);
6904abce 3595
50f6a20b 3596 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
0ac70e08
DC
3597 if (error) {
3598 if (error == -ENOSPC)
3599 return 0;
3600 return error;
3601 }
755477b4 3602
0ac70e08
DC
3603 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3604 xfs_is_inode32(mp)) {
1a5a98d5 3605 target = XFS_AGB_TO_FSB(mp,
0ac70e08
DC
3606 ((mp->m_agfrotor / rotorstep) %
3607 mp->m_sb.sb_agcount), 0);
3608 bump_rotor = 1;
3609 }
a3b4a951 3610
32cd26bf 3611 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
0ac70e08 3612 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
01f05365 3613 XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
32cd26bf 3614
0ac70e08
DC
3615 if (bump_rotor) {
3616 if (args->agno == start_agno)
3617 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3618 (mp->m_sb.sb_agcount * rotorstep);
3619 else
3620 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3621 (mp->m_sb.sb_agcount * rotorstep);
2bd0ea18 3622 }
9542ae13 3623
d99d5287 3624 return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
0ac70e08
DC
3625}
3626
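/*
 * Illustrative rotor behaviour for the inode32 case above: with the
 * default xfs_rotorstep of 1, each initial user-data allocation starts
 * one AG further along than the last, spreading new files round-robin
 * across the AGs; a rotorstep of N keeps roughly N consecutive files in
 * the same AG before the start point advances.
 */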
3627/*
32cd26bf 3628 * Iterate from the agno indicated via @target through to the end of the
0ac70e08 3629 * filesystem attempting blocking allocation. This does not wrap or try a second
32cd26bf 3630 * pass, so it will not fall back to AGs lower than indicated by the target.
0ac70e08 3631 */
5ba1f915 3632int
0ac70e08
DC
3633xfs_alloc_vextent_first_ag(
3634 struct xfs_alloc_arg *args,
5ba1f915
DC
3635 xfs_fsblock_t target)
3636 {
0ac70e08 3637 struct xfs_mount *mp = args->mp;
50f6a20b 3638 xfs_agnumber_t minimum_agno;
0ac70e08 3639 xfs_agnumber_t start_agno;
01f05365 3640 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK;
0ac70e08
DC
3641 int error;
3642
fc59ca7e
DW
3643 ASSERT(args->pag == NULL);
3644
50f6a20b
DC
3645 args->agno = NULLAGNUMBER;
3646 args->agbno = NULLAGBLOCK;
6904abce 3647
002b5d96 3648 trace_xfs_alloc_vextent_first_ag(args);
6904abce 3649
50f6a20b 3650 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
0ac70e08
DC
3651 if (error) {
3652 if (error == -ENOSPC)
3653 return 0;
3654 return error;
2bd0ea18 3655 }
755477b4 3656
5ba1f915 3657 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target));
32cd26bf 3658 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno,
01f05365 3659 XFS_FSB_TO_AGBNO(mp, target), alloc_flags);
d99d5287 3660 return xfs_alloc_vextent_finish(args, minimum_agno, error, true);
2bd0ea18
NS
3661}
3662
c4e10f2b 3663/*
a74b5502
DC
3664 * Allocate at the exact block target or fail. Caller is expected to hold a
3665 * perag reference in args->pag.
c4e10f2b
DC
3666 */
3667int
3668xfs_alloc_vextent_exact_bno(
3669 struct xfs_alloc_arg *args,
3670 xfs_fsblock_t target)
3671{
3672 struct xfs_mount *mp = args->mp;
50f6a20b 3673 xfs_agnumber_t minimum_agno;
c4e10f2b
DC
3674 int error;
3675
fc59ca7e
DW
3676 ASSERT(args->pag != NULL);
3677 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3678
50f6a20b
DC
3679 args->agno = XFS_FSB_TO_AGNO(mp, target);
3680 args->agbno = XFS_FSB_TO_AGBNO(mp, target);
6904abce 3681
002b5d96 3682 trace_xfs_alloc_vextent_exact_bno(args);
6904abce 3683
50f6a20b 3684 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
c4e10f2b
DC
3685 if (error) {
3686 if (error == -ENOSPC)
3687 return 0;
3688 return error;
3689 }
3690
8b2a40ff 3691 error = xfs_alloc_vextent_prepare_ag(args, 0);
a74b5502 3692 if (!error && args->agbp)
32cd26bf 3693 error = xfs_alloc_ag_vextent_exact(args);
c4e10f2b 3694
d99d5287 3695 return xfs_alloc_vextent_finish(args, minimum_agno, error, false);
c4e10f2b
DC
3696}
3697
0ac70e08 3698/*
15653695
DC
3699 * Allocate an extent as close to the target as possible. If there are no
3700 * viable candidates in the AG, then fail the allocation.
a74b5502
DC
3701 *
3702 * Caller may or may not have a per-ag reference in args->pag.
0ac70e08
DC
3703 */
3704int
15653695
DC
3705xfs_alloc_vextent_near_bno(
3706 struct xfs_alloc_arg *args,
3707 xfs_fsblock_t target)
0ac70e08 3708{
15653695 3709 struct xfs_mount *mp = args->mp;
50f6a20b 3710 xfs_agnumber_t minimum_agno;
d99d5287 3711 bool needs_perag = args->pag == NULL;
01f05365 3712 uint32_t alloc_flags = 0;
04215b9f 3713 int error;
0ac70e08 3714
fc59ca7e
DW
3715 if (!needs_perag)
3716 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target));
3717
50f6a20b
DC
3718 args->agno = XFS_FSB_TO_AGNO(mp, target);
3719 args->agbno = XFS_FSB_TO_AGBNO(mp, target);
6904abce 3720
002b5d96 3721 trace_xfs_alloc_vextent_near_bno(args);
6904abce 3722
50f6a20b 3723 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno);
15653695
DC
3724 if (error) {
3725 if (error == -ENOSPC)
3726 return 0;
3727 return error;
0ac70e08 3728 }
15653695 3729
d99d5287 3730 if (needs_perag)
91cceb74 3731 args->pag = xfs_perag_grab(mp, args->agno);
d99d5287 3732
01f05365 3733 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags);
a74b5502 3734 if (!error && args->agbp)
01f05365 3735 error = xfs_alloc_ag_vextent_near(args, alloc_flags);
a74b5502 3736
d99d5287 3737 return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag);
0ac70e08
DC
3738}
3739
2a6da3b8
DC
3740/* Ensure that the freelist is at full capacity. */
3741int
3742xfs_free_extent_fix_freelist(
3743 struct xfs_trans *tp,
7635c486 3744 struct xfs_perag *pag,
2a6da3b8 3745 struct xfs_buf **agbp)
2bd0ea18 3746{
2a6da3b8
DC
3747 struct xfs_alloc_arg args;
3748 int error;
2bd0ea18 3749
2a6da3b8 3750 memset(&args, 0, sizeof(struct xfs_alloc_arg));
2bd0ea18
NS
3751 args.tp = tp;
3752 args.mp = tp->t_mountp;
7635c486
DC
3753 args.agno = pag->pag_agno;
3754 args.pag = pag;
a2ceac1f
DC
3755
3756 /*
3757 * validate that the block number is legal - this enables us to detect
3758 * and handle a silent filesystem corruption rather than crashing.
3759 */
a2ceac1f 3760 if (args.agno >= args.mp->m_sb.sb_agcount)
12b53197 3761 return -EFSCORRUPTED;
a2ceac1f 3762
a2ceac1f
DC
3763 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3764 if (error)
7635c486 3765 return error;
2a6da3b8
DC
3766
3767 *agbp = args.agbp;
7635c486 3768 return 0;
2a6da3b8
DC
3769}
3770
3771/*
3772 * Free an extent.
3773 * Just break up the extent address and hand off to xfs_free_ag_extent
3774 * after fixing up the freelist.
3775 */
5837e73b 3776int
3a13f959 3777__xfs_free_extent(
5837e73b 3778 struct xfs_trans *tp,
42c1e5c1
DW
3779 struct xfs_perag *pag,
3780 xfs_agblock_t agbno,
5837e73b
DW
3781 xfs_extlen_t len,
3782 const struct xfs_owner_info *oinfo,
3783 enum xfs_ag_resv_type type,
3784 bool skip_discard)
2a6da3b8 3785{
5837e73b
DW
3786 struct xfs_mount *mp = tp->t_mountp;
3787 struct xfs_buf *agbp;
0bc284c2 3788 struct xfs_agf *agf;
5837e73b
DW
3789 int error;
3790 unsigned int busy_flags = 0;
2a6da3b8
DC
3791
3792 ASSERT(len != 0);
9760cac2 3793 ASSERT(type != XFS_AG_RESV_AGFL);
2a6da3b8 3794
a9da40de 3795 if (XFS_TEST_ERROR(false, mp,
e2a190dd 3796 XFS_ERRTAG_FREE_EXTENT))
a9da40de
DW
3797 return -EIO;
3798
7635c486 3799 error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
2a6da3b8 3800 if (error)
42c1e5c1 3801 return error;
0bc284c2 3802 agf = agbp->b_addr;
2a6da3b8 3803
fbb4fa7f
DW
3804 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3805 error = -EFSCORRUPTED;
7635c486 3806 goto err_release;
fbb4fa7f 3807 }
a2ceac1f
DC
3808
3809 /* validate the extent size is legal now that we have the agf locked */
0bc284c2 3810 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
fbb4fa7f 3811 error = -EFSCORRUPTED;
7635c486 3812 goto err_release;
fbb4fa7f 3813 }
a2ceac1f 3814
42c1e5c1
DW
3815 error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
3816 type);
2a6da3b8 3817 if (error)
7635c486 3818 goto err_release;
2a6da3b8 3819
3a13f959
BF
3820 if (skip_discard)
3821 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
7635c486 3822 xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
2a6da3b8
DC
3823 return 0;
3824
7635c486 3825err_release:
2a6da3b8 3826 xfs_trans_brelse(tp, agbp);
2bd0ea18
NS
3827 return error;
3828}
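/*
 * Illustrative call sequence, assuming @tp, @mp, @ip, @fsbno, @off and
 * @len exist in the caller's context (many callers defer frees via EFIs
 * rather than calling this directly):
 *
 *	struct xfs_owner_info	oinfo;
 *	struct xfs_perag	*pag;
 *
 *	xfs_rmap_ino_owner(&oinfo, ip->i_ino, XFS_DATA_FORK, off);
 *	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, fsbno));
 *	error = __xfs_free_extent(tp, pag, XFS_FSB_TO_AGBNO(mp, fsbno),
 *			len, &oinfo, XFS_AG_RESV_NONE, false);
 *	xfs_perag_put(pag);
 */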
b3d83fa6
DW
3829
3830struct xfs_alloc_query_range_info {
3831 xfs_alloc_query_range_fn fn;
3832 void *priv;
3833};
3834
3835/* Format btree record and pass to our callback. */
3836STATIC int
3837xfs_alloc_query_range_helper(
3838 struct xfs_btree_cur *cur,
e62318a3 3839 const union xfs_btree_rec *rec,
b3d83fa6
DW
3840 void *priv)
3841{
3842 struct xfs_alloc_query_range_info *query = priv;
3843 struct xfs_alloc_rec_incore irec;
e70bf9ba 3844 xfs_failaddr_t fa;
b3d83fa6 3845
c7005aef 3846 xfs_alloc_btrec_to_irec(rec, &irec);
e70bf9ba
DW
3847 fa = xfs_alloc_check_irec(cur, &irec);
3848 if (fa)
3849 return xfs_alloc_complain_bad_rec(cur, fa, &irec);
c7005aef 3850
b3d83fa6
DW
3851 return query->fn(cur, &irec, query->priv);
3852}
3853
3854/* Find all free space within a given range of blocks. */
3855int
3856xfs_alloc_query_range(
3857 struct xfs_btree_cur *cur,
d34c6373
DW
3858 const struct xfs_alloc_rec_incore *low_rec,
3859 const struct xfs_alloc_rec_incore *high_rec,
b3d83fa6
DW
3860 xfs_alloc_query_range_fn fn,
3861 void *priv)
3862{
05dcea07
DW
3863 union xfs_btree_irec low_brec = { .a = *low_rec };
3864 union xfs_btree_irec high_brec = { .a = *high_rec };
3865 struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn };
b3d83fa6
DW
3866
3867 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
b3d83fa6
DW
3868 return xfs_btree_query_range(cur, &low_brec, &high_brec,
3869 xfs_alloc_query_range_helper, &query);
3870}
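/*
 * Illustrative query callback, assuming a caller that wants to count
 * the free extents in a range:
 *
 *	static int
 *	count_free_helper(
 *		struct xfs_btree_cur		*cur,
 *		const struct xfs_alloc_rec_incore *rec,
 *		void				*priv)
 *	{
 *		uint64_t			*nr = priv;
 *
 *		(*nr)++;
 *		return 0;
 *	}
 *
 * A nonzero return value from the callback stops the walk and is passed
 * back to the caller of xfs_alloc_query_range().
 */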
7e05e856
DW
3871
3872/* Find all free space records. */
3873int
3874xfs_alloc_query_all(
3875 struct xfs_btree_cur *cur,
3876 xfs_alloc_query_range_fn fn,
3877 void *priv)
3878{
3879 struct xfs_alloc_query_range_info query;
3880
3881 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3882 query.priv = priv;
3883 query.fn = fn;
3884 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3885}
9bef6258 3886
9ba4dc82
DW
3887/*
3888 * Scan part of the keyspace of the free space and tell us if the area has no
3889 * records, is fully mapped by records, or is partially filled.
3890 */
1fe41a73 3891int
9ba4dc82 3892xfs_alloc_has_records(
1fe41a73
DW
3893 struct xfs_btree_cur *cur,
3894 xfs_agblock_t bno,
3895 xfs_extlen_t len,
9ba4dc82 3896 enum xbtree_recpacking *outcome)
1fe41a73
DW
3897{
3898 union xfs_btree_irec low;
3899 union xfs_btree_irec high;
3900
3901 memset(&low, 0, sizeof(low));
3902 low.a.ar_startblock = bno;
3903 memset(&high, 0xFF, sizeof(high));
3904 high.a.ar_startblock = bno + len - 1;
3905
d99b8900 3906 return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
1fe41a73 3907}
71a98c66
DW
3908
3909/*
3910 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
4a509d6d 3911 * error code or XFS_ITER_*.
71a98c66
DW
3912 */
3913int
3914xfs_agfl_walk(
3915 struct xfs_mount *mp,
3916 struct xfs_agf *agf,
3917 struct xfs_buf *agflbp,
3918 xfs_agfl_walk_fn walk_fn,
3919 void *priv)
3920{
3921 __be32 *agfl_bno;
3922 unsigned int i;
3923 int error;
3924
b134e771 3925 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
71a98c66
DW
3926 i = be32_to_cpu(agf->agf_flfirst);
3927
3928 /* Nothing to walk in an empty AGFL. */
3929 if (agf->agf_flcount == cpu_to_be32(0))
3930 return 0;
3931
3932 /* Otherwise, walk from first to last, wrapping as needed. */
3933 for (;;) {
3934 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3935 if (error)
3936 return error;
3937 if (i == be32_to_cpu(agf->agf_fllast))
3938 break;
3939 if (++i == xfs_agfl_size(mp))
3940 i = 0;
3941 }
3942
3943 return 0;
3944}
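/*
 * Illustrative walk_fn, assuming a caller that wants to collect every
 * AGFL block number into an array it has sized to xfs_agfl_size():
 *
 *	struct agfl_collect {
 *		xfs_agblock_t	*blocks;
 *		unsigned int	nr;
 *	};
 *
 *	static int
 *	agfl_collect_fn(struct xfs_mount *mp, xfs_agblock_t bno, void *priv)
 *	{
 *		struct agfl_collect	*c = priv;
 *
 *		c->blocks[c->nr++] = bno;
 *		return 0;
 *	}
 */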
7d84b02d
DW
3945
3946int __init
3947xfs_extfree_intent_init_cache(void)
3948{
3949 xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
3950 sizeof(struct xfs_extent_free_item),
3951 0, 0, NULL);
3952
3953 return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
3954}
3955
3956void
3957xfs_extfree_intent_destroy_cache(void)
3958{
3959 kmem_cache_destroy(xfs_extfree_item_cache);
3960 xfs_extfree_item_cache = NULL;
3961}