// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "libxfs_priv.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"

struct kmem_cache	*xfs_extfree_item_cache;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2

STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);

/*
 * Size of the AGFL. For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_has_crc(mp))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}

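/*
 * Illustrative example (not part of the original source): on a V5 (CRC)
 * filesystem with 512 byte sectors, the 36 byte struct xfs_agfl header
 * leaves (512 - 36) / sizeof(xfs_agblock_t) = 119 AGFL slots per block.
 */
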
unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_has_reflink(mp))
		return xfs_refc_block(mp) + 1;
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

/*
 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the
 * blocks used by the freesp btrees themselves, and the blocks owned by the
 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
 * free space in the AG drop so low that the free space btrees cannot refill an
 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
 * until the fs goes down, we subtract this many AG blocks from the incore
 * fdblocks to ensure user allocation does not overcommit the space the
 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
 * withhold space from xfs_mod_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4

/*
 * Compute the number of blocks that we set aside to guarantee the ability to
 * refill the AGFL and handle a full bmap btree split.
 *
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks. However, these restrictions may result in no actual space
 * allocated for a delayed extent, for example, a data block in a certain AG is
 * allocated but there is no additional block for the additional bmap btree
 * block due to a split of the bmap btree of the file. The result of this may
 * lead to an infinite loop when the file gets flushed to disk and all delayed
 * extents need to be actually allocated. To get around this, we explicitly set
 * aside a few blocks which will not be reserved in delayed allocation.
 *
 * For each AG, we need to reserve enough blocks to replenish a totally empty
 * AGFL and 4 more to handle a potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}

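/*
 * Illustrative example (not part of the original source): a filesystem
 * with sb_agcount = 16 sets aside 16 * (XFS_ALLOCBT_AGFL_RESERVE + 4) =
 * 128 blocks of incore free space for AGFL refills and bmap btree splits.
 */
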
/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the AGI free inode and rmap btree root blocks.
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *	- the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOCBT_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_has_finobt(mp))
		blocks++;		/* finobt root block */
	if (xfs_has_rmapbt(mp))
		blocks++;		/* rmap root block */
	if (xfs_has_reflink(mp))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}

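/*
 * Illustrative example (not part of the original source, assuming 512 byte
 * sectors and 4k blocks): the four sector-sized headers round up to a
 * single block, so with finobt, rmapbt and reflink all enabled the
 * deduction is 1 + 4 + 3 + 1 + 1 + 1 = 11 blocks off sb_agblocks.
 */
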
/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
STATIC int				/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;
	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && cur->bc_ag.abt.active;
}

/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_alloc_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		*bno,	/* output: starting block of extent */
	xfs_extlen_t		*len,	/* output: length of extent */
	int			*stat)	/* output: success/failure */
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	union xfs_btree_rec	*rec;
	int			error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || !(*stat))
		return error;

	*bno = be32_to_cpu(rec->alloc.ar_startblock);
	*len = be32_to_cpu(rec->alloc.ar_blockcount);

	if (*len == 0)
		goto out_bad_rec;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbext(pag, *bno, *len))
		goto out_bad_rec;

	return 0;

out_bad_rec:
	xfs_warn(mp,
		"%s Freespace BTree record corruption in AG %d detected!",
		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
		pag->pag_agno);
	xfs_warn(mp,
		"start block 0x%x block count 0x%x", *bno, *len);
	return -EFSCORRUPTED;
}

/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
STATIC bool
xfs_alloc_compute_aligned(
	xfs_alloc_arg_t	*args,		/* allocation argument structure */
	xfs_agblock_t	foundbno,	/* starting block in found extent */
	xfs_extlen_t	foundlen,	/* length in found extent */
	xfs_agblock_t	*resbno,	/* result block number */
	xfs_extlen_t	*reslen,	/* result length */
	unsigned	*busy_gen)
{
	xfs_agblock_t	bno = foundbno;
	xfs_extlen_t	len = foundlen;
	xfs_extlen_t	diff;
	bool		busy;

	/* Trim busy sections out of found extent */
	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

	/*
	 * If we have a largish extent that happens to start before min_agbno,
	 * see if we can shift it into range...
	 */
	if (bno < args->min_agbno && bno + len > args->min_agbno) {
		diff = args->min_agbno - bno;
		if (len > diff) {
			bno += diff;
			len -= diff;
		}
	}

	if (args->alignment > 1 && len >= args->minlen) {
		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);

		diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}

	return busy;
}

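/*
 * Illustrative example (not part of the original source): for a found
 * extent [bno = 7, len = 10] with args->alignment = 4 and no busy
 * overlap, aligned_bno = roundup(7, 4) = 8 and diff = 1, so the result
 * is [*resbno = 8, *reslen = 9].
 */
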
/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t			/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1 = 0;	/* length with newbno1 */
	xfs_extlen_t	newlen2 = 0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before desired block. The second case is there to allow
	 * for contiguous allocation from the remaining free space if the file
	 * grows in the short term.
	 */
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}

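/*
 * Illustrative example (not part of the original source): for a metadata
 * allocation with wantbno = 100, wantlen = 10, alignment = 1 and a free
 * extent [freebno = 110, freelen = 20], the first branch applies:
 * *newbnop = 110 and the returned diff is XFS_ABSDIFF(110, 100) = 10.
 */
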
/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}

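/*
 * Illustrative example (not part of the original source): with prod = 4,
 * mod = 1 and args->len = 10, k = 10 % 4 = 2 > mod, so the length is
 * trimmed to 10 - (2 - 1) = 9, which satisfies 9 = 2 * 4 + 1,
 * i.e. 9 % 4 == 1.
 */
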
/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1 = 0;	/* first new free length */
	xfs_extlen_t	nflen2 = 0;	/* second new free length */
	struct xfs_mount *mp;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs))
			return -EFSCORRUPTED;
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}
	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1))
		return -EFSCORRUPTED;
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	return 0;
}

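/*
 * Illustrative example (not part of the original source): carving
 * [rbno = 110, rlen = 20] out of the free extent [fbno = 100, flen = 50]
 * hits the fourth case above and leaves two new free extents,
 * [nfbno1 = 100, nflen1 = 10] and [nfbno2 = 130, nflen2 = 20].
 */
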
static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}

static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_buf_log_item *bip = bp->b_log_item;
	xfs_failaddr_t	fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_has_crc(mp))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}

const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};

/*
 * Read in the allocation group free block array.
 */
int
xfs_alloc_read_agfl(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}

STATIC int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}

/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur	*cnt;	/* btree cursors */
	struct xfs_btree_cur	*bnolt;
	struct xfs_btree_cur	*bnogt;
	xfs_extlen_t		cur_len;/* current search length */
	xfs_agblock_t		rec_bno;/* extent startblock */
	xfs_extlen_t		rec_len;/* extent length */
	xfs_agblock_t		bno;	/* alloc bno */
	xfs_extlen_t		len;	/* alloc len */
	xfs_extlen_t		diff;	/* diff from search bno */
	unsigned int		busy_gen;/* busy state */
	bool			busy;
};

/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
	 */
	if (!acur->cnt)
		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_CNT);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_BNO);
	if (!acur->bnogt)
		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_BNO);
	return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1))
		return -EFSCORRUPTED;

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation mode.
	 */
	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation. This only deactivates
	 * the current cursor, but this is just an optimization to terminate a
	 * cntbt search that otherwise runs to the edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_ag.abt.active = false;
	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
				  *new);
	return 0;
}

/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}

/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg		*args,
	struct xfs_alloc_cur		*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
		if (error)
			return error;
	}

	/*
	 * Increment the search key until we find at least one allocation
	 * candidate or if the extent we found was larger. Otherwise, double the
	 * search key to optimize the search. Efficiency is more important here
	 * than absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;

	return error;
}

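/*
 * Illustrative example (not part of the original source): with a search
 * length of 16 and no candidate found yet, the key only creeps to 17;
 * once a candidate exists and acur->cur_len < 32, the doubled key (32)
 * is taken instead, trading locality precision for fewer cntbt lookups.
 */
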
/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist if
 * there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;

	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
			&fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}

/*
 * Allocate a variable extent in the allocation group agno.
 * Type and bno are used to determine where in the allocation group the
 * extent will start.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent(
	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
{
	int		error = 0;

	ASSERT(args->minlen > 0);
	ASSERT(args->maxlen > 0);
	ASSERT(args->minlen <= args->maxlen);
	ASSERT(args->mod < args->prod);
	ASSERT(args->alignment > 0);

	/*
	 * Branch to correct routine based on the type.
	 */
	args->wasfromfl = 0;
	switch (args->type) {
	case XFS_ALLOCTYPE_THIS_AG:
		error = xfs_alloc_ag_vextent_size(args);
		break;
	case XFS_ALLOCTYPE_NEAR_BNO:
		error = xfs_alloc_ag_vextent_near(args);
		break;
	case XFS_ALLOCTYPE_THIS_BNO:
		error = xfs_alloc_ag_vextent_exact(args);
		break;
	default:
		ASSERT(0);
		/* NOTREACHED */
	}

	if (error || args->agbno == NULLAGBLOCK)
		return error;

	ASSERT(args->len >= args->minlen);
	ASSERT(args->len <= args->maxlen);
	ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
	ASSERT(args->agbno % args->alignment == 0);

	/* if not file data, insert new block into the reverse map btree */
	if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
		error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
				       args->agbno, args->len, &args->oinfo);
		if (error)
			return error;
	}

	if (!args->wasfromfl) {
		error = xfs_alloc_update_counters(args->tp, args->agbp,
						  -((long)(args->len)));
		if (error)
			return error;

		ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
					      args->agbno, args->len));
	}

	xfs_ag_resv_alloc_extent(args->pag, args->resv, args);

	XFS_STATS_INC(args->mp, xs_allocx);
	XFS_STATS_ADD(args->mp, xs_allocb, args->len);
	return error;
}

/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
	struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_BNO);

	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno, it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag, XFS_BTNUM_CNT);
	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}

/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,	  /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;

	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_ag.abt.active = false;

		if (count > 0)
			count--;
	}

	return 0;
}

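/*
 * Illustrative example (not part of the original source): the NEAR
 * locality search below calls this as xfs_alloc_walk_iter(args, acur,
 * acur->bnolt, false, true, 1, &i), i.e. examine at most one record to
 * the left and report via *stat whether it produced a new candidate.
 */
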
/*
 * Search the by-bno and by-size btrees in parallel in search of an extent with
 * ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);
	ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand, the
	 * bnobt is able to satisfy most smaller allocation requests much more
	 * quickly than the cntbt. The cntbt search can sift through fragmented
	 * free space and sets of free extents for larger allocation requests
	 * more quickly than the bnobt. Since the locality hint is just a hint
	 * and we don't want to scan the entire bnobt for perfect locality, the
	 * cntbt search essentially bounds the bnobt search such that we can
	 * find good enough locality at reasonable performance in most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);

		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_ag.abt.active = true;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}

/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (get_random_u32_below(2))
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks.  If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_levels[0].ptr = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1))
				return -EFSCORRUPTED;
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work. We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}

/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
				&len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * First algorithm.
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree.  If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
				&allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			trace_xfs_alloc_near_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag,
					      acur.busy_gen);
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}

1661/*
1662 * Allocate a variable extent anywhere in the allocation group agno.
1663 * Extent's length (returned in len) will be between minlen and maxlen,
1664 * and of the form k * prod + mod unless there's nothing that large.
1665 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1666 */
1667STATIC int /* error */
1668xfs_alloc_ag_vextent_size(
dfc130f3 1669 xfs_alloc_arg_t *args) /* allocation argument structure */
2bd0ea18 1670{
0bc284c2 1671 struct xfs_agf *agf = args->agbp->b_addr;
ec924d04
DW
1672 struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
1673 struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
2bd0ea18
NS
1674 int error; /* error result */
1675 xfs_agblock_t fbno; /* start of found freespace */
1676 xfs_extlen_t flen; /* length of found freespace */
2bd0ea18
NS
1677 int i; /* temp status variable */
1678 xfs_agblock_t rbno; /* returned block number */
1679 xfs_extlen_t rlen; /* length of returned extent */
cd80de04
CH
1680 bool busy;
1681 unsigned busy_gen;
2bd0ea18 1682
a2ceac1f 1683restart:
2bd0ea18
NS
1684 /*
1685 * Allocate and initialize a cursor for the by-size btree.
1686 */
b194c7d8 1687 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
ecb44e84 1688 args->pag, XFS_BTNUM_CNT);
2bd0ea18 1689 bno_cur = NULL;
a2ceac1f 1690
2bd0ea18
NS
1691 /*
1692 * Look for an entry >= maxlen+alignment-1 blocks.
1693 */
0e266570
NS
1694 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1695 args->maxlen + args->alignment - 1, &i)))
2bd0ea18 1696 goto error0;
a2ceac1f 1697
2bd0ea18 1698 /*
cd80de04
CH
1699	 * If none, then we have to settle for a smaller extent. In the case that
1700 * there are no large extents, this will return the last entry in the
1701 * tree unless the tree is empty. In the case that there are only busy
1702 * large extents, this will return the largest small extent unless there
a2ceac1f 1703 * are no smaller extents available.
5000d01d 1704 */
cd80de04 1705 if (!i) {
a2ceac1f
DC
1706 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1707 &fbno, &flen, &i);
1708 if (error)
2bd0ea18
NS
1709 goto error0;
1710 if (i == 0 || flen == 0) {
1711 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
56b2de80 1712 trace_xfs_alloc_size_noentry(args);
2bd0ea18
NS
1713 return 0;
1714 }
1715 ASSERT(i == 1);
cd80de04
CH
1716 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1717 &rlen, &busy_gen);
a2ceac1f
DC
1718 } else {
1719 /*
1720 * Search for a non-busy extent that is large enough.
a2ceac1f
DC
1721 */
1722 for (;;) {
1723 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1724 if (error)
1725 goto error0;
fbb4fa7f
DW
1726 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1727 error = -EFSCORRUPTED;
1728 goto error0;
1729 }
a2ceac1f 1730
cd80de04
CH
1731 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1732 &rbno, &rlen, &busy_gen);
a2ceac1f
DC
1733
1734 if (rlen >= args->maxlen)
1735 break;
1736
1737 error = xfs_btree_increment(cnt_cur, 0, &i);
1738 if (error)
1739 goto error0;
1740 if (i == 0) {
1741 /*
1742 * Our only valid extents must have been busy.
1743				 * Make them unbusy by forcing the log out and
cd80de04 1744 * retrying.
a2ceac1f
DC
1745 */
1746 xfs_btree_del_cursor(cnt_cur,
1747 XFS_BTREE_NOERROR);
1748 trace_xfs_alloc_size_busy(args);
cd80de04
CH
1749 xfs_extent_busy_flush(args->mp,
1750 args->pag, busy_gen);
a2ceac1f
DC
1751 goto restart;
1752 }
1753 }
2bd0ea18 1754 }
a2ceac1f 1755
2bd0ea18
NS
1756 /*
1757 * In the first case above, we got the last entry in the
1758 * by-size btree. Now we check to see if the space hits maxlen
1759 * once aligned; if not, we search left for something better.
1760 * This can't happen in the second case above.
1761 */
2bd0ea18 1762 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
fbb4fa7f
DW
1763 if (XFS_IS_CORRUPT(args->mp,
1764 rlen != 0 &&
1765 (rlen > flen ||
1766 rbno + rlen > fbno + flen))) {
1767 error = -EFSCORRUPTED;
1768 goto error0;
1769 }
2bd0ea18
NS
1770 if (rlen < args->maxlen) {
1771 xfs_agblock_t bestfbno;
1772 xfs_extlen_t bestflen;
1773 xfs_agblock_t bestrbno;
1774 xfs_extlen_t bestrlen;
1775
1776 bestrlen = rlen;
1777 bestrbno = rbno;
1778 bestflen = flen;
1779 bestfbno = fbno;
1780 for (;;) {
b194c7d8 1781 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
2bd0ea18
NS
1782 goto error0;
1783 if (i == 0)
1784 break;
0e266570
NS
1785 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1786 &i)))
2bd0ea18 1787 goto error0;
fbb4fa7f
DW
1788 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1789 error = -EFSCORRUPTED;
1790 goto error0;
1791 }
2bd0ea18
NS
1792 if (flen < bestrlen)
1793 break;
cd80de04
CH
1794 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1795 &rbno, &rlen, &busy_gen);
2bd0ea18 1796 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
fbb4fa7f
DW
1797 if (XFS_IS_CORRUPT(args->mp,
1798 rlen != 0 &&
1799 (rlen > flen ||
1800 rbno + rlen > fbno + flen))) {
1801 error = -EFSCORRUPTED;
1802 goto error0;
1803 }
2bd0ea18
NS
1804 if (rlen > bestrlen) {
1805 bestrlen = rlen;
1806 bestrbno = rbno;
1807 bestflen = flen;
1808 bestfbno = fbno;
1809 if (rlen == args->maxlen)
1810 break;
1811 }
5000d01d 1812 }
0e266570
NS
1813 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1814 &i)))
2bd0ea18 1815 goto error0;
fbb4fa7f
DW
1816 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1817 error = -EFSCORRUPTED;
1818 goto error0;
1819 }
2bd0ea18
NS
1820 rlen = bestrlen;
1821 rbno = bestrbno;
1822 flen = bestflen;
1823 fbno = bestfbno;
1824 }
1825 args->wasfromfl = 0;
1826 /*
1827 * Fix up the length.
1828 */
1829 args->len = rlen;
a2ceac1f 1830 if (rlen < args->minlen) {
cd80de04 1831 if (busy) {
a2ceac1f
DC
1832 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1833 trace_xfs_alloc_size_busy(args);
cd80de04 1834 xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
a2ceac1f
DC
1835 goto restart;
1836 }
1837 goto out_nominleft;
2bd0ea18 1838 }
a2ceac1f
DC
1839 xfs_alloc_fix_len(args);
1840
2bd0ea18 1841 rlen = args->len;
fbb4fa7f
DW
1842 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1843 error = -EFSCORRUPTED;
1844 goto error0;
1845 }
2bd0ea18
NS
1846 /*
1847 * Allocate and initialize a cursor for the by-block tree.
1848 */
b194c7d8 1849 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
ecb44e84 1850 args->pag, XFS_BTNUM_BNO);
0e266570
NS
1851 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1852 rbno, rlen, XFSA_FIXUP_CNT_OK)))
2bd0ea18
NS
1853 goto error0;
1854 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1855 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1856 cnt_cur = bno_cur = NULL;
1857 args->len = rlen;
1858 args->agbno = rbno;
fbb4fa7f
DW
1859 if (XFS_IS_CORRUPT(args->mp,
1860 args->agbno + args->len >
0bc284c2 1861 be32_to_cpu(agf->agf_length))) {
fbb4fa7f
DW
1862 error = -EFSCORRUPTED;
1863 goto error0;
1864 }
56b2de80 1865 trace_xfs_alloc_size_done(args);
2bd0ea18
NS
1866 return 0;
1867
1868error0:
56b2de80 1869 trace_xfs_alloc_size_error(args);
2bd0ea18
NS
1870 if (cnt_cur)
1871 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1872 if (bno_cur)
1873 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1874 return error;
a2ceac1f
DC
1875
1876out_nominleft:
1877 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1878 trace_xfs_alloc_size_nominleft(args);
1879 args->agbno = NULLAGBLOCK;
1880 return 0;
2bd0ea18
NS
1881}
1882
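/*
 * A minimal sketch (not the kernel code) of the leftward scan in
 * xfs_alloc_ag_vextent_size() above: walk by-size records from the largest
 * downwards, keep the best aligned candidate, and stop once records become
 * shorter than the current best or maxlen is reached. The record array and
 * alignment helper below are user-space stand-ins for the cntbt cursor.
 */
#include <stdint.h>
#include <stdio.h>

struct rec { uint32_t bno; uint32_t len; };

/* Usable length once the start is rounded up to an 'align' boundary. */
static uint32_t aligned_len(struct rec r, uint32_t align)
{
	uint32_t abno = ((r.bno + align - 1) / align) * align;
	uint32_t skip = abno - r.bno;

	return skip >= r.len ? 0 : r.len - skip;
}

int main(void)
{
	/* records as the cntbt orders them: ascending length */
	struct rec recs[] = { { 37, 5 }, { 100, 6 }, { 10, 8 }, { 64, 8 } };
	uint32_t align = 8, maxlen = 10, best = 0, bestbno = 0;

	for (int i = 3; i >= 0; i--) {
		uint32_t rlen = aligned_len(recs[i], align);

		if (recs[i].len < best)	/* records only get shorter */
			break;
		if (rlen > best) {
			best = rlen;
			bestbno = recs[i].bno;
			if (best == maxlen)
				break;
		}
	}
	/* prints: best aligned extent: bno=64 len=8 */
	printf("best aligned extent: bno=%u len=%u\n", bestbno, best);
	return 0;
}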
2bd0ea18
NS
1883/*
1884 * Free the extent starting at agno/bno for length.
1885 */
85aec44f 1886STATIC int
2bd0ea18 1887xfs_free_ag_extent(
5837e73b
DW
1888 struct xfs_trans *tp,
1889 struct xfs_buf *agbp,
1890 xfs_agnumber_t agno,
1891 xfs_agblock_t bno,
1892 xfs_extlen_t len,
1893 const struct xfs_owner_info *oinfo,
1894 enum xfs_ag_resv_type type)
2bd0ea18 1895{
5837e73b 1896 struct xfs_mount *mp;
5837e73b
DW
1897 struct xfs_btree_cur *bno_cur;
1898 struct xfs_btree_cur *cnt_cur;
1899 xfs_agblock_t gtbno; /* start of right neighbor */
1900 xfs_extlen_t gtlen; /* length of right neighbor */
1901 xfs_agblock_t ltbno; /* start of left neighbor */
1902 xfs_extlen_t ltlen; /* length of left neighbor */
1903 xfs_agblock_t nbno; /* new starting block of freesp */
1904 xfs_extlen_t nlen; /* new length of freespace */
1905 int haveleft; /* have a left neighbor */
1906 int haveright; /* have a right neighbor */
1907 int i;
1908 int error;
195c248c 1909 struct xfs_perag *pag = agbp->b_pag;
2bd0ea18 1910
631ac87a 1911 bno_cur = cnt_cur = NULL;
2bd0ea18 1912 mp = tp->t_mountp;
631ac87a 1913
3ee858aa 1914 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
195c248c 1915 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
631ac87a
DW
1916 if (error)
1917 goto error0;
1918 }
1919
5000d01d 1920 /*
2bd0ea18
NS
1921 * Allocate and initialize a cursor for the by-block btree.
1922 */
ecb44e84 1923 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
5000d01d 1924 /*
2bd0ea18
NS
1925 * Look for a neighboring block on the left (lower block numbers)
1926 * that is contiguous with this space.
1927 */
0e266570 1928 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
2bd0ea18
NS
1929 goto error0;
1930 if (haveleft) {
1931 /*
1932 * There is a block to our left.
1933 */
0e266570 1934 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
2bd0ea18 1935 goto error0;
fbb4fa7f
DW
1936 if (XFS_IS_CORRUPT(mp, i != 1)) {
1937 error = -EFSCORRUPTED;
1938 goto error0;
1939 }
2bd0ea18
NS
1940 /*
1941 * It's not contiguous, though.
1942 */
1943 if (ltbno + ltlen < bno)
1944 haveleft = 0;
1945 else {
1946 /*
1947			 * If this failure happens, the request to free this
1948 * space was invalid, it's (partly) already free.
1949 * Very bad.
1950 */
fbb4fa7f
DW
1951 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1952 error = -EFSCORRUPTED;
1953 goto error0;
1954 }
2bd0ea18
NS
1955 }
1956 }
5000d01d 1957 /*
2bd0ea18
NS
1958 * Look for a neighboring block on the right (higher block numbers)
1959 * that is contiguous with this space.
1960 */
b194c7d8 1961 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2bd0ea18
NS
1962 goto error0;
1963 if (haveright) {
1964 /*
1965 * There is a block to our right.
1966 */
0e266570 1967 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2bd0ea18 1968 goto error0;
fbb4fa7f
DW
1969 if (XFS_IS_CORRUPT(mp, i != 1)) {
1970 error = -EFSCORRUPTED;
1971 goto error0;
1972 }
2bd0ea18
NS
1973 /*
1974 * It's not contiguous, though.
1975 */
1976 if (bno + len < gtbno)
1977 haveright = 0;
1978 else {
1979 /*
1980			 * If this failure happens, the request to free this
1981 * space was invalid, it's (partly) already free.
1982 * Very bad.
1983 */
fbb4fa7f
DW
1984 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1985 error = -EFSCORRUPTED;
1986 goto error0;
1987 }
2bd0ea18
NS
1988 }
1989 }
1990 /*
1991 * Now allocate and initialize a cursor for the by-size tree.
1992 */
ecb44e84 1993 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2bd0ea18
NS
1994 /*
1995 * Have both left and right contiguous neighbors.
1996 * Merge all three into a single free block.
1997 */
1998 if (haveleft && haveright) {
1999 /*
2000 * Delete the old by-size entry on the left.
2001 */
0e266570 2002 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2bd0ea18 2003 goto error0;
fbb4fa7f
DW
2004 if (XFS_IS_CORRUPT(mp, i != 1)) {
2005 error = -EFSCORRUPTED;
2006 goto error0;
2007 }
b194c7d8 2008 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2009 goto error0;
fbb4fa7f
DW
2010 if (XFS_IS_CORRUPT(mp, i != 1)) {
2011 error = -EFSCORRUPTED;
2012 goto error0;
2013 }
2bd0ea18
NS
2014 /*
2015 * Delete the old by-size entry on the right.
2016 */
0e266570 2017 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2bd0ea18 2018 goto error0;
fbb4fa7f
DW
2019 if (XFS_IS_CORRUPT(mp, i != 1)) {
2020 error = -EFSCORRUPTED;
2021 goto error0;
2022 }
b194c7d8 2023 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2024 goto error0;
fbb4fa7f
DW
2025 if (XFS_IS_CORRUPT(mp, i != 1)) {
2026 error = -EFSCORRUPTED;
2027 goto error0;
2028 }
2bd0ea18
NS
2029 /*
2030 * Delete the old by-block entry for the right block.
2031 */
b194c7d8 2032 if ((error = xfs_btree_delete(bno_cur, &i)))
2bd0ea18 2033 goto error0;
fbb4fa7f
DW
2034 if (XFS_IS_CORRUPT(mp, i != 1)) {
2035 error = -EFSCORRUPTED;
2036 goto error0;
2037 }
2bd0ea18
NS
2038 /*
2039 * Move the by-block cursor back to the left neighbor.
2040 */
b194c7d8 2041 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2bd0ea18 2042 goto error0;
fbb4fa7f
DW
2043 if (XFS_IS_CORRUPT(mp, i != 1)) {
2044 error = -EFSCORRUPTED;
2045 goto error0;
2046 }
2bd0ea18
NS
2047#ifdef DEBUG
2048 /*
2049 * Check that this is the right record: delete didn't
2050 * mangle the cursor.
2051 */
2052 {
2053 xfs_agblock_t xxbno;
2054 xfs_extlen_t xxlen;
2055
0e266570
NS
2056 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2057 &i)))
2bd0ea18 2058 goto error0;
fbb4fa7f
DW
2059 if (XFS_IS_CORRUPT(mp,
2060 i != 1 ||
2061 xxbno != ltbno ||
2062 xxlen != ltlen)) {
2063 error = -EFSCORRUPTED;
2064 goto error0;
2065 }
2bd0ea18
NS
2066 }
2067#endif
2068 /*
2069 * Update remaining by-block entry to the new, joined block.
2070 */
2071 nbno = ltbno;
2072 nlen = len + ltlen + gtlen;
0e266570 2073 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
2074 goto error0;
2075 }
2076 /*
2077 * Have only a left contiguous neighbor.
2078 * Merge it together with the new freespace.
2079 */
2080 else if (haveleft) {
2081 /*
2082 * Delete the old by-size entry on the left.
2083 */
0e266570 2084 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2bd0ea18 2085 goto error0;
fbb4fa7f
DW
2086 if (XFS_IS_CORRUPT(mp, i != 1)) {
2087 error = -EFSCORRUPTED;
2088 goto error0;
2089 }
b194c7d8 2090 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2091 goto error0;
fbb4fa7f
DW
2092 if (XFS_IS_CORRUPT(mp, i != 1)) {
2093 error = -EFSCORRUPTED;
2094 goto error0;
2095 }
2bd0ea18
NS
2096 /*
2097 * Back up the by-block cursor to the left neighbor, and
2098 * update its length.
2099 */
b194c7d8 2100 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2bd0ea18 2101 goto error0;
fbb4fa7f
DW
2102 if (XFS_IS_CORRUPT(mp, i != 1)) {
2103 error = -EFSCORRUPTED;
2104 goto error0;
2105 }
2bd0ea18
NS
2106 nbno = ltbno;
2107 nlen = len + ltlen;
0e266570 2108 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
2109 goto error0;
2110 }
2111 /*
2112 * Have only a right contiguous neighbor.
2113 * Merge it together with the new freespace.
2114 */
2115 else if (haveright) {
2116 /*
2117 * Delete the old by-size entry on the right.
2118 */
0e266570 2119 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2bd0ea18 2120 goto error0;
fbb4fa7f
DW
2121 if (XFS_IS_CORRUPT(mp, i != 1)) {
2122 error = -EFSCORRUPTED;
2123 goto error0;
2124 }
b194c7d8 2125 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 2126 goto error0;
fbb4fa7f
DW
2127 if (XFS_IS_CORRUPT(mp, i != 1)) {
2128 error = -EFSCORRUPTED;
2129 goto error0;
2130 }
2bd0ea18 2131 /*
5000d01d 2132 * Update the starting block and length of the right
2bd0ea18
NS
2133 * neighbor in the by-block tree.
2134 */
2135 nbno = bno;
2136 nlen = len + gtlen;
0e266570 2137 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
2138 goto error0;
2139 }
2140 /*
2141 * No contiguous neighbors.
2142 * Insert the new freespace into the by-block tree.
2143 */
2144 else {
2145 nbno = bno;
2146 nlen = len;
b194c7d8 2147 if ((error = xfs_btree_insert(bno_cur, &i)))
2bd0ea18 2148 goto error0;
fbb4fa7f
DW
2149 if (XFS_IS_CORRUPT(mp, i != 1)) {
2150 error = -EFSCORRUPTED;
2151 goto error0;
2152 }
2bd0ea18
NS
2153 }
2154 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2155 bno_cur = NULL;
2156 /*
2157 * In all cases we need to insert the new freespace in the by-size tree.
2158 */
0e266570 2159 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2bd0ea18 2160 goto error0;
fbb4fa7f
DW
2161 if (XFS_IS_CORRUPT(mp, i != 0)) {
2162 error = -EFSCORRUPTED;
2163 goto error0;
2164 }
b194c7d8 2165 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 2166 goto error0;
fbb4fa7f
DW
2167 if (XFS_IS_CORRUPT(mp, i != 1)) {
2168 error = -EFSCORRUPTED;
2169 goto error0;
2170 }
2bd0ea18
NS
2171 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2172 cnt_cur = NULL;
a2ceac1f 2173
2bd0ea18
NS
2174 /*
2175 * Update the freespace totals in the ag and superblock.
2176 */
10c0a390
GX
2177 error = xfs_alloc_update_counters(tp, agbp, len);
2178 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
a2ceac1f
DC
2179 if (error)
2180 goto error0;
2181
79896434
BD
2182 XFS_STATS_INC(mp, xs_freex);
2183 XFS_STATS_ADD(mp, xs_freeb, len);
56b2de80 2184
65a15e06 2185 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
3e535bba 2186
2bd0ea18
NS
2187 return 0;
2188
2189 error0:
65a15e06 2190 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2bd0ea18
NS
2191 if (bno_cur)
2192 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2193 if (cnt_cur)
2194 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2195 return error;
2196}
2197
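/*
 * Illustrative sketch (not part of the source): the four coalescing cases
 * in xfs_free_ag_extent() above reduce to a small decision table. Given
 * optional left/right neighbors that touch the freed range, the merged
 * by-block record is computed as below; a standalone model with made-up
 * block numbers.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t bno = 50, len = 10;		/* extent being freed */
	uint32_t ltbno = 40, ltlen = 10;	/* left neighbor ends at 50 */
	uint32_t gtbno = 60, gtlen = 5;		/* right neighbor starts at 60 */
	bool haveleft = (ltbno + ltlen == bno);
	bool haveright = (bno + len == gtbno);
	uint32_t nbno, nlen;

	if (haveleft && haveright) {		/* merge all three */
		nbno = ltbno;
		nlen = ltlen + len + gtlen;
	} else if (haveleft) {			/* extend left neighbor */
		nbno = ltbno;
		nlen = ltlen + len;
	} else if (haveright) {			/* absorb right neighbor */
		nbno = bno;
		nlen = len + gtlen;
	} else {				/* no merge possible */
		nbno = bno;
		nlen = len;
	}
	/* prints: new free record: [40, 65) */
	printf("new free record: [%u, %u)\n", nbno, nbno + nlen);
	return 0;
}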
5000d01d 2198/*
2bd0ea18
NS
2199 * Visible (exported) allocation/free functions.
2200 * Some of these are used just by xfs_alloc_btree.c and this file.
2201 */
2202
2203/*
4aa2259d 2204 * Compute and fill in value of m_alloc_maxlevels.
2bd0ea18
NS
2205 */
2206void
2207xfs_alloc_compute_maxlevels(
2208 xfs_mount_t *mp) /* file system mount structure */
2209{
4aa2259d 2210 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
730e2a19 2211 (mp->m_sb.sb_agblocks + 1) / 2);
441815c7 2212 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
2bd0ea18
NS
2213}
2214
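/*
 * A simplified sketch of the computation above, under the assumption that
 * the worst-case number of free-space records in an AG is half its blocks
 * (alternating free/used): the maximum tree height is how many times the
 * minimum per-block fanout must multiply to cover that count. Standalone
 * model; the mnr values are made-up stand-ins for mp->m_alloc_mnr, and
 * xfs_btree_compute_maxlevels() itself is not shown here.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int compute_maxlevels(const unsigned int mnr[2],
				      uint64_t records)
{
	unsigned int level = 1;
	uint64_t maxrecs = mnr[0];	/* leaf capacity at minimum fill */

	while (maxrecs < records) {
		maxrecs *= mnr[1];	/* each node level multiplies */
		level++;
	}
	return level;
}

int main(void)
{
	unsigned int mnr[2] = { 125, 62 };	/* min recs: leaf, node */
	uint64_t agblocks = 1048576;

	/* prints: maxlevels = 4 */
	printf("maxlevels = %u\n",
	       compute_maxlevels(mnr, (agblocks + 1) / 2));
	return 0;
}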
56b2de80 2215/*
cf8ce220
DW
2216 * Find the length of the longest extent in an AG. The 'need' parameter
2217 * specifies how much space we're going to need for the AGFL and the
2218 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2219 * other callers.
56b2de80
DC
2220 */
2221xfs_extlen_t
2222xfs_alloc_longest_free_extent(
72bda06d 2223 struct xfs_perag *pag,
cf8ce220
DW
2224 xfs_extlen_t need,
2225 xfs_extlen_t reserved)
56b2de80 2226{
72bda06d 2227 xfs_extlen_t delta = 0;
56b2de80 2228
cf8ce220
DW
2229 /*
2230 * If the AGFL needs a recharge, we'll have to subtract that from the
2231 * longest extent.
2232 */
56b2de80
DC
2233 if (need > pag->pagf_flcount)
2234 delta = need - pag->pagf_flcount;
2235
cf8ce220
DW
2236 /*
2237 * If we cannot maintain others' reservations with space from the
2238 * not-longest freesp extents, we'll have to subtract /that/ from
2239 * the longest extent too.
2240 */
2241 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2242 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2243
2244 /*
2245 * If the longest extent is long enough to satisfy all the
2246 * reservations and AGFL rules in place, we can return this extent.
2247 */
56b2de80 2248 if (pag->pagf_longest > delta)
14058d94
DC
2249 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2250 pag->pagf_longest - delta);
cf8ce220
DW
2251
2252 /* Otherwise, let the caller try for 1 block if there's space. */
56b2de80
DC
2253 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2254}
2255
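/*
 * A worked example (not part of the source) of the delta computation
 * above, with concrete numbers; the m_ag_max_usable clamp is omitted for
 * brevity.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pagf_flcount = 2, pagf_freeblks = 100, pagf_longest = 60;
	uint32_t need = 6, reserved = 50;
	uint32_t delta = 0;

	if (need > pagf_flcount)		/* AGFL refill comes first */
		delta = need - pagf_flcount;	/* delta = 4 */
	if (pagf_freeblks - pagf_longest < reserved)	/* 40 < 50 */
		delta += reserved - (pagf_freeblks - pagf_longest); /* +10 */
	if (pagf_longest > delta)
		/* prints: longest usable: 46 */
		printf("longest usable: %u\n", pagf_longest - delta);
	return 0;
}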
b75bb1bd
DW
2256/*
2257 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
2258 * return the largest possible minimum length.
2259 */
de046644
DC
2260unsigned int
2261xfs_alloc_min_freelist(
2262 struct xfs_mount *mp,
2263 struct xfs_perag *pag)
2264{
b75bb1bd
DW
2265 /* AG btrees have at least 1 level. */
2266 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2267 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
de046644
DC
2268 unsigned int min_free;
2269
4aa2259d 2270 ASSERT(mp->m_alloc_maxlevels > 0);
b75bb1bd 2271
de046644 2272 /* space needed by-bno freespace btree */
b75bb1bd 2273 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
4aa2259d 2274 mp->m_alloc_maxlevels);
de046644 2275 /* space needed by-size freespace btree */
b75bb1bd 2276 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
4aa2259d 2277 mp->m_alloc_maxlevels);
b8a8d6e5 2278 /* space needed reverse mapping used space btree */
94541a16 2279 if (xfs_has_rmapbt(mp))
b75bb1bd
DW
2280 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2281 mp->m_rmap_maxlevels);
de046644
DC
2282
2283 return min_free;
2284}
2285
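/*
 * Illustrative sketch (not part of the source): the minimum AGFL length
 * above is the sum of one extra level's worth of blocks per free-space
 * btree (bno and cnt, plus rmap when enabled), each clamped to the
 * maximum possible tree height. Standalone model with made-up heights.
 */
#include <stdio.h>

static unsigned int one_more_level(unsigned int levels, unsigned int max)
{
	return levels + 1 < max ? levels + 1 : max;
}

int main(void)
{
	unsigned int bno_levels = 2, cnt_levels = 2, rmap_levels = 3;
	unsigned int alloc_maxlevels = 5, rmap_maxlevels = 6;
	int has_rmapbt = 1;
	unsigned int min_free;

	min_free = one_more_level(bno_levels, alloc_maxlevels);		/* 3 */
	min_free += one_more_level(cnt_levels, alloc_maxlevels);	/* +3 */
	if (has_rmapbt)
		min_free += one_more_level(rmap_levels, rmap_maxlevels); /* +4 */
	/* prints: min AGFL length: 10 */
	printf("min AGFL length: %u\n", min_free);
	return 0;
}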
5515b7c1
DC
2286/*
2287 * Check if the operation we are fixing up the freelist for should go ahead or
2288 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2289 * is dependent on whether the size and shape of free space available will
2290 * permit the requested allocation to take place.
2291 */
2292static bool
2293xfs_alloc_space_available(
2294 struct xfs_alloc_arg *args,
2295 xfs_extlen_t min_free,
2296 int flags)
2297{
2298 struct xfs_perag *pag = args->pag;
3fe4a6dd 2299 xfs_extlen_t alloc_len, longest;
cf8ce220 2300 xfs_extlen_t reservation; /* blocks that are still reserved */
5515b7c1 2301 int available;
580a4380 2302 xfs_extlen_t agflcount;
5515b7c1
DC
2303
2304 if (flags & XFS_ALLOC_FLAG_FREEING)
2305 return true;
2306
cf8ce220
DW
2307 reservation = xfs_ag_resv_needed(pag, args->resv);
2308
5515b7c1 2309 /* do we have enough contiguous free space for the allocation? */
3fe4a6dd 2310 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
1421de38 2311 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
3fe4a6dd 2312 if (longest < alloc_len)
5515b7c1
DC
2313 return false;
2314
580a4380
BF
2315 /*
2316 * Do we have enough free space remaining for the allocation? Don't
2317	 * account extra agfl blocks because we are about to defer freeing them,
2318 * making them unavailable until the current transaction commits.
2319 */
2320 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2321 available = (int)(pag->pagf_freeblks + agflcount -
2c003dc2 2322 reservation - min_free - args->minleft);
3fe4a6dd 2323 if (available < (int)max(args->total, alloc_len))
5515b7c1
DC
2324 return false;
2325
2c003dc2
CH
2326 /*
2327 * Clamp maxlen to the amount of free space available for the actual
2328 * extent allocation.
2329 */
2330 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2331 args->maxlen = available;
2332 ASSERT(args->maxlen > 0);
2333 ASSERT(args->maxlen >= args->minlen);
2334 }
2335
5515b7c1
DC
2336 return true;
2337}
2338
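/*
 * A minimal model (not the kernel code) of the two gates above: a
 * contiguous-space check against the longest free extent, then an
 * overall-space check against the remaining free blocks. All numbers
 * are made up.
 */
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned int minlen = 8, alignment = 4, minalignslop = 0, minleft = 1;
	unsigned int total = 12, longest = 20;
	int freeblks = 40, agflcount = 4, reservation = 10, min_free = 6;
	unsigned int alloc_len = minlen + (alignment - 1) + minalignslop;
	int available = freeblks + agflcount - reservation - min_free -
			minleft;
	bool ok = longest >= alloc_len &&
		  available >= (int)(total > alloc_len ? total : alloc_len);

	/* prints: alloc_len=11 available=27 -> go ahead */
	printf("alloc_len=%u available=%d -> %s\n",
	       alloc_len, available, ok ? "go ahead" : "reject");
	return 0;
}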
30c8be8a
BF
2339int
2340xfs_free_agfl_block(
2341 struct xfs_trans *tp,
2342 xfs_agnumber_t agno,
2343 xfs_agblock_t agbno,
2344 struct xfs_buf *agbp,
2345 struct xfs_owner_info *oinfo)
2346{
2347 int error;
2348 struct xfs_buf *bp;
2349
2350 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2351 XFS_AG_RESV_AGFL);
2352 if (error)
2353 return error;
2354
78fcd346
DW
2355 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2356 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2357 tp->t_mountp->m_bsize, 0, &bp);
2358 if (error)
2359 return error;
30c8be8a
BF
2360 xfs_trans_binval(tp, bp);
2361
2362 return 0;
2363}
2364
8dbee8f5
BF
2365/*
2366 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2367 * is to detect an agfl header padding mismatch between current and early v5
2368 * kernels. This problem manifests as a 1-slot size difference between the
2369 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2370 * may also catch variants of agfl count corruption unrelated to padding. Either
2371 * way, we'll reset the agfl and warn the user.
2372 *
2373 * Return true if a reset is required before the agfl can be used, false
2374 * otherwise.
2375 */
2376static bool
2377xfs_agfl_needs_reset(
2378 struct xfs_mount *mp,
2379 struct xfs_agf *agf)
2380{
2381 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2382 uint32_t l = be32_to_cpu(agf->agf_fllast);
2383 uint32_t c = be32_to_cpu(agf->agf_flcount);
2384 int agfl_size = xfs_agfl_size(mp);
2385 int active;
2386
2387 /* no agfl header on v4 supers */
b16a427a 2388 if (!xfs_has_crc(mp))
8dbee8f5
BF
2389 return false;
2390
2391 /*
2392 * The agf read verifier catches severe corruption of these fields.
2393 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2394 * the verifier allows it.
2395 */
2396 if (f >= agfl_size || l >= agfl_size)
2397 return true;
2398 if (c > agfl_size)
2399 return true;
2400
2401 /*
2402 * Check consistency between the on-disk count and the active range. An
2403 * agfl padding mismatch manifests as an inconsistent flcount.
2404 */
2405 if (c && l >= f)
2406 active = l - f + 1;
2407 else if (c)
2408 active = agfl_size - f + l + 1;
2409 else
2410 active = 0;
2411
2412 return active != c;
2413}
2414
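/*
 * Illustrative sketch (not part of the source) of the active-slot count
 * above for a wrapped AGFL. Standalone model; the AGFL size and the
 * first/last/count values are made up.
 */
#include <stdint.h>
#include <stdio.h>

static int active_slots(uint32_t f, uint32_t l, uint32_t c, int agfl_size)
{
	if (c && l >= f)		/* not wrapped: [f, l] */
		return l - f + 1;
	if (c)				/* wrapped past the end of the array */
		return agfl_size - f + l + 1;
	return 0;			/* empty list */
}

int main(void)
{
	int agfl_size = 119;		/* made-up size */

	/* wrapped list occupying slots 117, 118, 0, 1: 4 active entries */
	printf("active=%d flcount=4 -> %s\n",
	       active_slots(117, 1, 4, agfl_size),
	       active_slots(117, 1, 4, agfl_size) == 4 ? "ok" : "reset");
	return 0;
}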
2415/*
2416 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2417 * agfl content cannot be trusted. Warn the user that a repair is required to
2418 * recover leaked blocks.
2419 *
2420 * The purpose of this mechanism is to handle filesystems affected by the agfl
2421 * header padding mismatch problem. A reset keeps the filesystem online with a
2422 * relatively minor free space accounting inconsistency rather than suffer the
2423 * inevitable crash from use of an invalid agfl block.
2424 */
2425static void
2426xfs_agfl_reset(
2427 struct xfs_trans *tp,
2428 struct xfs_buf *agbp,
2429 struct xfs_perag *pag)
2430{
2431 struct xfs_mount *mp = tp->t_mountp;
0bc284c2 2432 struct xfs_agf *agf = agbp->b_addr;
8dbee8f5 2433
03dc2ef2 2434 ASSERT(xfs_perag_agfl_needs_reset(pag));
8dbee8f5
BF
2435 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2436
2437 xfs_warn(mp,
2438 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2439 "Please unmount and run xfs_repair.",
2440 pag->pag_agno, pag->pagf_flcount);
2441
2442 agf->agf_flfirst = 0;
2443 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2444 agf->agf_flcount = 0;
2445 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2446 XFS_AGF_FLCOUNT);
2447
2448 pag->pagf_flcount = 0;
03dc2ef2 2449 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
8dbee8f5
BF
2450}
2451
d5c1b462
BF
2452/*
2453 * Defer an AGFL block free. This is effectively equivalent to
7d84b02d 2454 * xfs_free_extent_later() with some special handling particular to AGFL blocks.
d5c1b462
BF
2455 *
2456 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2457 * allocation operations in a transaction. AGFL frees are prone to this problem
2458 * because, for one, they are always freed one at a time. Further, an immediate
2459 * AGFL block free can cause a btree join and require another block free before
2460 * the real allocation can proceed. Deferring the free disconnects freeing up
2461 * the AGFL slot from freeing the block.
2462 */
2463STATIC void
2464xfs_defer_agfl_block(
21375e5d 2465 struct xfs_trans *tp,
d5c1b462
BF
2466 xfs_agnumber_t agno,
2467 xfs_fsblock_t agbno,
2468 struct xfs_owner_info *oinfo)
2469{
21375e5d 2470 struct xfs_mount *mp = tp->t_mountp;
feee990c 2471 struct xfs_extent_free_item *xefi;
d5c1b462 2472
7d84b02d 2473 ASSERT(xfs_extfree_item_cache != NULL);
d5c1b462
BF
2474 ASSERT(oinfo != NULL);
2475
feee990c 2476 xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
d7faa18a 2477 GFP_KERNEL | __GFP_NOFAIL);
feee990c
DW
2478 xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2479 xefi->xefi_blockcount = 1;
2480 xefi->xefi_owner = oinfo->oi_owner;
d5c1b462
BF
2481
2482 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2483
feee990c 2484 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
d5c1b462
BF
2485}
2486
7d84b02d
DW
2487/*
2488 * Add the extent to the list of extents to be free at transaction end.
2489 * The list is maintained sorted (by block number).
2490 */
2491void
2492__xfs_free_extent_later(
2493 struct xfs_trans *tp,
2494 xfs_fsblock_t bno,
2495 xfs_filblks_t len,
2496 const struct xfs_owner_info *oinfo,
2497 bool skip_discard)
2498{
feee990c 2499 struct xfs_extent_free_item *xefi;
7d84b02d
DW
2500#ifdef DEBUG
2501 struct xfs_mount *mp = tp->t_mountp;
2502 xfs_agnumber_t agno;
2503 xfs_agblock_t agbno;
2504
2505 ASSERT(bno != NULLFSBLOCK);
2506 ASSERT(len > 0);
d3e0c71f 2507 ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
7d84b02d
DW
2508 ASSERT(!isnullstartblock(bno));
2509 agno = XFS_FSB_TO_AGNO(mp, bno);
2510 agbno = XFS_FSB_TO_AGBNO(mp, bno);
2511 ASSERT(agno < mp->m_sb.sb_agcount);
2512 ASSERT(agbno < mp->m_sb.sb_agblocks);
2513 ASSERT(len < mp->m_sb.sb_agblocks);
2514 ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2515#endif
2516 ASSERT(xfs_extfree_item_cache != NULL);
2517
feee990c 2518 xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
7d84b02d 2519 GFP_KERNEL | __GFP_NOFAIL);
feee990c
DW
2520 xefi->xefi_startblock = bno;
2521 xefi->xefi_blockcount = (xfs_extlen_t)len;
6d72c6ea 2522 if (skip_discard)
feee990c 2523 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD;
6d72c6ea
DW
2524 if (oinfo) {
2525 ASSERT(oinfo->oi_offset == 0);
2526
2527 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
feee990c 2528 xefi->xefi_flags |= XFS_EFI_ATTR_FORK;
6d72c6ea 2529 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
feee990c
DW
2530 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2531 xefi->xefi_owner = oinfo->oi_owner;
6d72c6ea 2532 } else {
feee990c 2533 xefi->xefi_owner = XFS_RMAP_OWN_NULL;
6d72c6ea 2534 }
7d84b02d
DW
2535 trace_xfs_bmap_free_defer(tp->t_mountp,
2536 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2537 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
feee990c 2538 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
7d84b02d
DW
2539}
2540
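/*
 * Illustrative sketch (not part of the source): the DEBUG checks above
 * decompose a filesystem block number into (AG number, AG block). This
 * standalone model assumes the usual power-of-two AG addressing, where
 * the low sb_agblklog bits hold the AG-relative block.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int agblklog = 16;			/* 2^16 blocks per AG */
	uint64_t fsbno = (3ULL << agblklog) | 1234;	/* AG 3, block 1234 */
	uint32_t agno = fsbno >> agblklog;
	uint32_t agbno = fsbno & ((1U << agblklog) - 1);

	/* prints: fsbno 197842 -> agno 3 agbno 1234 */
	printf("fsbno %llu -> agno %u agbno %u\n",
	       (unsigned long long)fsbno, agno, agbno);
	return 0;
}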
3006cea4
CB
2541#ifdef DEBUG
2542/*
2543 * Check if an AGF has a free extent record whose length is equal to
2544 * args->minlen.
2545 */
2546STATIC int
2547xfs_exact_minlen_extent_available(
2548 struct xfs_alloc_arg *args,
2549 struct xfs_buf *agbp,
2550 int *stat)
2551{
2552 struct xfs_btree_cur *cnt_cur;
2553 xfs_agblock_t fbno;
2554 xfs_extlen_t flen;
2555 int error = 0;
2556
2557 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
ecb44e84 2558 args->pag, XFS_BTNUM_CNT);
3006cea4
CB
2559 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2560 if (error)
2561 goto out;
2562
2563 if (*stat == 0) {
2564 error = -EFSCORRUPTED;
2565 goto out;
2566 }
2567
2568 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2569 if (error)
2570 goto out;
2571
2572 if (*stat == 1 && flen != args->minlen)
2573 *stat = 0;
2574
2575out:
2576 xfs_btree_del_cursor(cnt_cur, error);
2577
2578 return error;
2579}
2580#endif
2581
2bd0ea18
NS
2582/*
2583 * Decide whether to use this allocation group for this allocation.
2584 * If so, fix up the btree freelist's size.
2bd0ea18 2585 */
ff105f75 2586int /* error */
2bd0ea18 2587xfs_alloc_fix_freelist(
c98e644e
DC
2588 struct xfs_alloc_arg *args, /* allocation argument structure */
2589 int flags) /* XFS_ALLOC_FLAG_... */
2bd0ea18 2590{
c98e644e
DC
2591 struct xfs_mount *mp = args->mp;
2592 struct xfs_perag *pag = args->pag;
2593 struct xfs_trans *tp = args->tp;
2594 struct xfs_buf *agbp = NULL;
2595 struct xfs_buf *agflbp = NULL;
2596 struct xfs_alloc_arg targs; /* local allocation arguments */
2597 xfs_agblock_t bno; /* freelist block */
2598 xfs_extlen_t need; /* total blocks needed in freelist */
fcdd428c 2599 int error = 0;
c98e644e 2600
d85d3424
BF
2601 /* deferred ops (AGFL block frees) require permanent transactions */
2602 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2603
03dc2ef2 2604 if (!xfs_perag_initialised_agf(pag)) {
f9084bd9 2605 error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
475f184c
DW
2606 if (error) {
2607 /* Couldn't lock the AGF so skip this AG. */
2608 if (error == -EAGAIN)
2609 error = 0;
c98e644e 2610 goto out_no_agbp;
2bd0ea18 2611 }
c98e644e 2612 }
34317449 2613
5e656dbb 2614 /*
c98e644e
DC
2615	 * If this is a metadata preferred pag and we are allocating user
2616	 * data, try somewhere else unless we are being asked to try harder
2617	 * at this point.
34317449 2618 */
03dc2ef2
DC
2619 if (xfs_perag_prefers_metadata(pag) &&
2620 (args->datatype & XFS_ALLOC_USERDATA) &&
5e656dbb
BN
2621 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2622 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
c98e644e 2623 goto out_agbp_relse;
34317449
NS
2624 }
2625
de046644 2626 need = xfs_alloc_min_freelist(mp, pag);
2c003dc2
CH
2627 if (!xfs_alloc_space_available(args, need, flags |
2628 XFS_ALLOC_FLAG_CHECK))
c98e644e 2629 goto out_agbp_relse;
5e656dbb 2630
2bd0ea18
NS
2631 /*
2632 * Get the a.g. freespace buffer.
2633 * Can fail if we're not blocking on locks, and it's held.
2634 */
c98e644e 2635 if (!agbp) {
f9084bd9 2636 error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
475f184c
DW
2637 if (error) {
2638 /* Couldn't lock the AGF so skip this AG. */
2639 if (error == -EAGAIN)
2640 error = 0;
c98e644e 2641 goto out_no_agbp;
2bd0ea18
NS
2642 }
2643 }
72bda06d 2644
8dbee8f5 2645 /* reset a padding mismatched agfl before final free space check */
03dc2ef2 2646 if (xfs_perag_agfl_needs_reset(pag))
8dbee8f5
BF
2647 xfs_agfl_reset(tp, agbp, pag);
2648
72bda06d 2649	/* If there isn't enough total space or a large enough single extent, reject it. */
de046644 2650 need = xfs_alloc_min_freelist(mp, pag);
c98e644e
DC
2651 if (!xfs_alloc_space_available(args, need, flags))
2652 goto out_agbp_relse;
5515b7c1 2653
3006cea4
CB
2654#ifdef DEBUG
2655 if (args->alloc_minlen_only) {
2656 int stat;
2657
2658 error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2659 if (error || !stat)
2660 goto out_agbp_relse;
2661 }
2662#endif
2bd0ea18
NS
2663 /*
2664 * Make the freelist shorter if it's too long.
72bda06d 2665 *
c98e644e
DC
2666 * Note that from this point onwards, we will always release the agf and
2667 * agfl buffers on error. This handles the case where we error out and
2668 * the buffers are clean or may not have been joined to the transaction
2669 * and hence need to be released manually. If they have been joined to
2670 * the transaction, then xfs_trans_brelse() will handle them
2671 * appropriately based on the recursion count and dirty state of the
2672 * buffer.
2673 *
72bda06d
DC
2674 * XXX (dgc): When we have lots of free space, does this buy us
2675 * anything other than extra overhead when we need to put more blocks
2676 * back on the free list? Maybe we should only do this when space is
2677 * getting low or the AGFL is more than half full?
e365af6f
DW
2678 *
2679 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2680 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2681 * updating the rmapbt. Both flags are used in xfs_repair while we're
2682 * rebuilding the rmapbt, and neither are used by the kernel. They're
2683 * both required to ensure that rmaps are correctly recorded for the
2684 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2685 * repair/rmap.c in xfsprogs for details.
2bd0ea18 2686 */
e365af6f 2687 memset(&targs, 0, sizeof(targs));
007347e3 2688 /* struct copy below */
e365af6f 2689 if (flags & XFS_ALLOC_FLAG_NORMAP)
007347e3 2690 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
e365af6f 2691 else
007347e3 2692 targs.oinfo = XFS_RMAP_OINFO_AG;
e365af6f 2693 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
1d202c10 2694 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
5e656dbb 2695 if (error)
c98e644e 2696 goto out_agbp_relse;
30c8be8a 2697
6dc24128
BF
2698 /* defer agfl frees */
2699 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2bd0ea18 2700 }
72bda06d 2701
2bd0ea18
NS
2702 targs.tp = tp;
2703 targs.mp = mp;
2704 targs.agbp = agbp;
2705 targs.agno = args->agno;
cf8ce220 2706 targs.alignment = targs.minlen = targs.prod = 1;
2bd0ea18
NS
2707 targs.type = XFS_ALLOCTYPE_THIS_AG;
2708 targs.pag = pag;
75c01ccc 2709 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
72bda06d 2710 if (error)
c98e644e 2711 goto out_agbp_relse;
72bda06d
DC
2712
2713 /* Make the freelist longer if it's too short. */
2714 while (pag->pagf_flcount < need) {
2bd0ea18 2715 targs.agbno = 0;
72bda06d 2716 targs.maxlen = need - pag->pagf_flcount;
9760cac2 2717 targs.resv = XFS_AG_RESV_AGFL;
72bda06d
DC
2718
2719 /* Allocate as many blocks as possible at once. */
2720 error = xfs_alloc_ag_vextent(&targs);
c98e644e
DC
2721 if (error)
2722 goto out_agflbp_relse;
2723
2bd0ea18 2724 /*
dfc130f3
RC
2725 * Stop if we run out. Won't happen if callers are obeying
2726 * the restrictions correctly. Can happen for free calls
2bd0ea18
NS
2727 * on a completely full ag.
2728 */
5e656dbb
BN
2729 if (targs.agbno == NULLAGBLOCK) {
2730 if (flags & XFS_ALLOC_FLAG_FREEING)
2731 break;
c98e644e 2732 goto out_agflbp_relse;
5e656dbb 2733 }
2bd0ea18
NS
2734 /*
2735 * Put each allocated block on the list.
2736 */
2737 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
9a73333d 2738 error = xfs_alloc_put_freelist(pag, tp, agbp,
5e656dbb
BN
2739 agflbp, bno, 0);
2740 if (error)
c98e644e 2741 goto out_agflbp_relse;
2bd0ea18
NS
2742 }
2743 }
cb4deb22 2744 xfs_trans_brelse(tp, agflbp);
2bd0ea18
NS
2745 args->agbp = agbp;
2746 return 0;
c98e644e
DC
2747
2748out_agflbp_relse:
2749 xfs_trans_brelse(tp, agflbp);
2750out_agbp_relse:
2751 if (agbp)
2752 xfs_trans_brelse(tp, agbp);
2753out_no_agbp:
2754 args->agbp = NULL;
2755 return error;
2bd0ea18
NS
2756}
2757
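/*
 * A minimal sketch (not the kernel code) of the freelist resizing in
 * xfs_alloc_fix_freelist() above: shrink by freeing one block per
 * iteration while the list is too long, then grow by allocating and
 * pushing while it is too short. An int array stands in for the AGFL.
 */
#include <stdio.h>

#define AGFL_MAX 16

int main(void)
{
	int agfl[AGFL_MAX] = { 500, 501, 502, 503, 504, 505 };
	int flcount = 6, need = 4, next_new = 900;

	while (flcount > need) {	/* too long: free the head block */
		int bno = agfl[0];

		for (int i = 1; i < flcount; i++)
			agfl[i - 1] = agfl[i];
		flcount--;
		printf("freed surplus block %d\n", bno);
	}
	need = 8;			/* pretend demand grew */
	while (flcount < need)		/* too short: allocate and push */
		agfl[flcount++] = next_new++;
	/* prints: final flcount 8 */
	printf("final flcount %d\n", flcount);
	return 0;
}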
2758/*
2759 * Get a block from the freelist.
2760 * Returns with the buffer for the block gotten.
2761 */
d2ff101f 2762int
2bd0ea18 2763xfs_alloc_get_freelist(
1d202c10 2764 struct xfs_perag *pag,
d2ff101f
DC
2765 struct xfs_trans *tp,
2766 struct xfs_buf *agbp,
2767 xfs_agblock_t *bnop,
2768 int btreeblk)
2bd0ea18 2769{
d2ff101f
DC
2770 struct xfs_agf *agf = agbp->b_addr;
2771 struct xfs_buf *agflbp;
2772 xfs_agblock_t bno;
2773 __be32 *agfl_bno;
2774 int error;
a562dd2f 2775 uint32_t logflags;
d2ff101f 2776 struct xfs_mount *mp = tp->t_mountp;
2bd0ea18 2777
2bd0ea18
NS
2778 /*
2779 * Freelist is empty, give up.
2780 */
46eca962 2781 if (!agf->agf_flcount) {
2bd0ea18
NS
2782 *bnop = NULLAGBLOCK;
2783 return 0;
2784 }
2785 /*
2786 * Read the array of free blocks.
2787 */
75c01ccc 2788 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
dd5b876e 2789 if (error)
2bd0ea18 2790 return error;
dd5b876e
DC
2791
2792
2bd0ea18
NS
2793 /*
2794 * Get the block number and update the data structures.
2795 */
b134e771 2796 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
dd5b876e 2797 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
5e656dbb 2798 be32_add_cpu(&agf->agf_flfirst, 1);
2bd0ea18 2799 xfs_trans_brelse(tp, agflbp);
b8165508 2800 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
46eca962 2801 agf->agf_flfirst = 0;
56b2de80 2802
03dc2ef2 2803 ASSERT(!xfs_perag_agfl_needs_reset(pag));
5e656dbb 2804 be32_add_cpu(&agf->agf_flcount, -1);
2bd0ea18 2805 pag->pagf_flcount--;
cdded3d8
DC
2806
2807 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2808 if (btreeblk) {
5e656dbb 2809 be32_add_cpu(&agf->agf_btreeblks, 1);
cdded3d8
DC
2810 pag->pagf_btreeblks++;
2811 logflags |= XFS_AGF_BTREEBLKS;
2812 }
2813
cdded3d8 2814 xfs_alloc_log_agf(tp, agbp, logflags);
2bd0ea18 2815 *bnop = bno;
3e535bba 2816
2bd0ea18
NS
2817 return 0;
2818}
2819
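/*
 * Illustrative sketch (not part of the source): the AGFL is a circular
 * array, so consuming a slot advances agf_flfirst and wraps it at the
 * AGFL size, exactly as the function above does. Standalone model with
 * made-up block numbers and a made-up size.
 */
#include <stdint.h>
#include <stdio.h>

#define AGFL_SIZE 119	/* made-up size */

int main(void)
{
	uint32_t agfl[AGFL_SIZE];
	uint32_t flfirst = AGFL_SIZE - 1, flcount = 2;

	for (uint32_t i = 0; i < AGFL_SIZE; i++)
		agfl[i] = 1000 + i;	/* fake free block numbers */

	while (flcount) {
		uint32_t bno = agfl[flfirst];

		if (++flfirst == AGFL_SIZE)	/* wrap around */
			flfirst = 0;
		flcount--;
		printf("got block %u, next slot %u\n", bno, flfirst);
	}
	return 0;
}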
2820/*
2821 * Log the given fields from the agf structure.
2822 */
2823void
2824xfs_alloc_log_agf(
a562dd2f
DC
2825 struct xfs_trans *tp,
2826 struct xfs_buf *bp,
2827 uint32_t fields)
2bd0ea18
NS
2828{
2829 int first; /* first byte offset */
2830 int last; /* last byte offset */
2831 static const short offsets[] = {
2832 offsetof(xfs_agf_t, agf_magicnum),
2833 offsetof(xfs_agf_t, agf_versionnum),
2834 offsetof(xfs_agf_t, agf_seqno),
2835 offsetof(xfs_agf_t, agf_length),
2836 offsetof(xfs_agf_t, agf_roots[0]),
2837 offsetof(xfs_agf_t, agf_levels[0]),
2838 offsetof(xfs_agf_t, agf_flfirst),
2839 offsetof(xfs_agf_t, agf_fllast),
2840 offsetof(xfs_agf_t, agf_flcount),
2841 offsetof(xfs_agf_t, agf_freeblks),
2842 offsetof(xfs_agf_t, agf_longest),
cdded3d8 2843 offsetof(xfs_agf_t, agf_btreeblks),
dd5b876e 2844 offsetof(xfs_agf_t, agf_uuid),
8511b71a 2845 offsetof(xfs_agf_t, agf_rmap_blocks),
bc859611
DW
2846 offsetof(xfs_agf_t, agf_refcount_blocks),
2847 offsetof(xfs_agf_t, agf_refcount_root),
2848 offsetof(xfs_agf_t, agf_refcount_level),
8511b71a
DW
2849 /* needed so that we don't log the whole rest of the structure: */
2850 offsetof(xfs_agf_t, agf_spare64),
2bd0ea18
NS
2851 sizeof(xfs_agf_t)
2852 };
2853
0bc284c2 2854 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
56b2de80 2855
bdc16ee5 2856 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
dd5b876e 2857
2bd0ea18
NS
2858 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2859 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2860}
2861
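/*
 * Illustrative sketch (not part of the source): logging a subset of AGF
 * fields reduces to finding the byte range spanned by the lowest and
 * highest set field bits against the offsets table, which is what
 * xfs_btree_offsets() computes. Standalone model with a trimmed-down
 * struct; the field names are stand-ins.
 */
#include <stddef.h>
#include <stdio.h>

struct fake_agf {
	unsigned int magic, version, seqno, length, flfirst, fllast, flcount;
};

int main(void)
{
	static const size_t offsets[] = {
		offsetof(struct fake_agf, magic),
		offsetof(struct fake_agf, version),
		offsetof(struct fake_agf, seqno),
		offsetof(struct fake_agf, length),
		offsetof(struct fake_agf, flfirst),
		offsetof(struct fake_agf, fllast),
		offsetof(struct fake_agf, flcount),
		sizeof(struct fake_agf),	/* sentinel: end of struct */
	};
	unsigned int fields = (1U << 4) | (1U << 6);	/* flfirst|flcount */
	int lo = -1, hi = -1;

	for (int i = 0; i < 7; i++) {
		if (fields & (1U << i)) {
			if (lo < 0)
				lo = i;
			hi = i;
		}
	}
	/*
	 * Log from the first dirty field to the last byte of the last one;
	 * prints: log bytes [16, 27]
	 */
	printf("log bytes [%zu, %zu]\n", offsets[lo], offsets[hi + 1] - 1);
	return 0;
}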
2bd0ea18
NS
2862/*
2863 * Put the block on the freelist for the allocation group.
2864 */
d2ff101f 2865int
2bd0ea18 2866xfs_alloc_put_freelist(
9a73333d 2867 struct xfs_perag *pag,
d2ff101f
DC
2868 struct xfs_trans *tp,
2869 struct xfs_buf *agbp,
2870 struct xfs_buf *agflbp,
2871 xfs_agblock_t bno,
2872 int btreeblk)
2bd0ea18 2873{
0bc284c2
CH
2874 struct xfs_mount *mp = tp->t_mountp;
2875 struct xfs_agf *agf = agbp->b_addr;
d2ff101f 2876 __be32 *blockp;
2bd0ea18 2877 int error;
a562dd2f 2878 uint32_t logflags;
dd5b876e
DC
2879 __be32 *agfl_bno;
2880 int startoff;
2bd0ea18 2881
75c01ccc
DC
2882 if (!agflbp) {
2883 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2884 if (error)
2885 return error;
2886 }
2887
5e656dbb 2888 be32_add_cpu(&agf->agf_fllast, 1);
b8165508 2889 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
46eca962 2890 agf->agf_fllast = 0;
56b2de80 2891
03dc2ef2 2892 ASSERT(!xfs_perag_agfl_needs_reset(pag));
5e656dbb 2893 be32_add_cpu(&agf->agf_flcount, 1);
2bd0ea18 2894 pag->pagf_flcount++;
cdded3d8
DC
2895
2896 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2897 if (btreeblk) {
5e656dbb 2898 be32_add_cpu(&agf->agf_btreeblks, -1);
cdded3d8
DC
2899 pag->pagf_btreeblks--;
2900 logflags |= XFS_AGF_BTREEBLKS;
2901 }
2902
5e656dbb
BN
2903 xfs_alloc_log_agf(tp, agbp, logflags);
2904
b8165508 2905 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
dd5b876e 2906
b134e771 2907 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
dd5b876e 2908 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
5e656dbb 2909 *blockp = cpu_to_be32(bno);
dd5b876e
DC
2910 startoff = (char *)blockp - (char *)agflbp->b_addr;
2911
cdded3d8 2912 xfs_alloc_log_agf(tp, agbp, logflags);
dd5b876e 2913
bdc16ee5 2914 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
dd5b876e
DC
2915 xfs_trans_log_buf(tp, agflbp, startoff,
2916 startoff + sizeof(xfs_agblock_t) - 1);
2bd0ea18
NS
2917 return 0;
2918}
2919
bc01119d 2920static xfs_failaddr_t
a2ceac1f 2921xfs_agf_verify(
95d9582b
DW
2922 struct xfs_buf *bp)
2923{
7861ef77 2924 struct xfs_mount *mp = bp->b_mount;
0bc284c2 2925 struct xfs_agf *agf = bp->b_addr;
a2ceac1f 2926
b16a427a 2927 if (xfs_has_crc(mp)) {
a65d8d29 2928 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
bc01119d 2929 return __this_address;
0bc284c2 2930 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
bc01119d 2931 return __this_address;
a65d8d29 2932 }
a2ceac1f 2933
68dbe77f
BF
2934 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2935 return __this_address;
2936
2937 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
dd5b876e 2938 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
b8165508
DC
2939 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2940 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2941 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
bc01119d 2942 return __this_address;
a2ceac1f 2943
0f24228d
ZB
2944 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2945 return __this_address;
2946
2947 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2948 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2949 return __this_address;
2950
00795aae
DW
2951 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2952 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
4aa2259d
DW
2953 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
2954 mp->m_alloc_maxlevels ||
2955 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
2956 mp->m_alloc_maxlevels)
bc01119d 2957 return __this_address;
5a35bf2c 2958
b16a427a 2959 if (xfs_has_rmapbt(mp) &&
00795aae 2960 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
4aa2259d
DW
2961 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
2962 mp->m_rmap_maxlevels))
bc01119d 2963 return __this_address;
e37838e5 2964
94541a16 2965 if (xfs_has_rmapbt(mp) &&
0f24228d
ZB
2966 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2967 return __this_address;
2968
a2ceac1f
DC
2969 /*
2970 * during growfs operations, the perag is not fully initialised,
2971 * so we can't use it for any useful checking. growfs ensures we can't
2972 * use it by using uncached buffers that don't have the perag attached
2973 * so we can detect and avoid this problem.
2974 */
dd5b876e 2975 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
bc01119d 2976 return __this_address;
a2ceac1f 2977
94541a16 2978 if (xfs_has_lazysbcount(mp) &&
dd5b876e 2979 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
bc01119d 2980 return __this_address;
dd5b876e 2981
94541a16 2982 if (xfs_has_reflink(mp) &&
0f24228d
ZB
2983 be32_to_cpu(agf->agf_refcount_blocks) >
2984 be32_to_cpu(agf->agf_length))
2985 return __this_address;
2986
94541a16 2987 if (xfs_has_reflink(mp) &&
00795aae 2988 (be32_to_cpu(agf->agf_refcount_level) < 1 ||
55369096 2989 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
bc01119d 2990 return __this_address;
88ce0792 2991
bc01119d 2992 return NULL;
a2ceac1f 2993
a2ceac1f
DC
2994}
2995
2996static void
2997xfs_agf_read_verify(
2998 struct xfs_buf *bp)
2999{
7861ef77 3000 struct xfs_mount *mp = bp->b_mount;
1e697959 3001 xfs_failaddr_t fa;
dd5b876e 3002
b16a427a 3003 if (xfs_has_crc(mp) &&
45922933 3004 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
1e697959
DW
3005 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3006 else {
95d9582b 3007 fa = xfs_agf_verify(bp);
1e697959
DW
3008 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3009 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
3010 }
a2ceac1f
DC
3011}
3012
3013static void
3014xfs_agf_write_verify(
3015 struct xfs_buf *bp)
3016{
7861ef77 3017 struct xfs_mount *mp = bp->b_mount;
37d086ca 3018 struct xfs_buf_log_item *bip = bp->b_log_item;
0bc284c2 3019 struct xfs_agf *agf = bp->b_addr;
1e697959 3020 xfs_failaddr_t fa;
dd5b876e 3021
95d9582b 3022 fa = xfs_agf_verify(bp);
1e697959
DW
3023 if (fa) {
3024 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
dd5b876e
DC
3025 return;
3026 }
3027
b16a427a 3028 if (!xfs_has_crc(mp))
dd5b876e
DC
3029 return;
3030
3031 if (bip)
0bc284c2 3032 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
dd5b876e 3033
43b5aeed 3034 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
a2ceac1f
DC
3035}
3036
3037const struct xfs_buf_ops xfs_agf_buf_ops = {
a3fac935 3038 .name = "xfs_agf",
68dbe77f 3039 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
a2ceac1f
DC
3040 .verify_read = xfs_agf_read_verify,
3041 .verify_write = xfs_agf_write_verify,
95d9582b 3042 .verify_struct = xfs_agf_verify,
a2ceac1f
DC
3043};
3044
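/*
 * A toy model (not the kernel code) of the verifier pairing above: reads
 * check the checksum before the structure, writes check the structure
 * before stamping the checksum. The checksum and magic below are made up;
 * the real code uses crc32c and also stamps the log LSN.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct blk { uint32_t magic; uint32_t payload; uint32_t cksum; };

static uint32_t toy_cksum(const struct blk *b)
{
	return b->magic ^ b->payload ^ 0xdeadbeef;	/* stand-in for CRC */
}

static bool verify_struct(const struct blk *b)
{
	return b->magic == 0x58414746;	/* made-up magic */
}

static bool read_verify(const struct blk *b)
{
	if (b->cksum != toy_cksum(b))	/* CRC first, as on read */
		return false;
	return verify_struct(b);
}

static void write_verify(struct blk *b)
{
	if (!verify_struct(b)) {	/* structure first, as on write */
		fprintf(stderr, "corrupt block, not writing\n");
		return;
	}
	b->cksum = toy_cksum(b);	/* stamp checksum last */
}

int main(void)
{
	struct blk b = { 0x58414746, 42, 0 };

	write_verify(&b);
	printf("read ok: %d\n", read_verify(&b));	/* prints: 1 */
	return 0;
}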
2bd0ea18
NS
3045/*
3046 * Read in the allocation group header (free/alloc section).
3047 */
c1030eda 3048int
56b2de80 3049xfs_read_agf(
c1030eda
DC
3050 struct xfs_perag *pag,
3051 struct xfs_trans *tp,
3052 int flags,
3053 struct xfs_buf **agfbpp)
2bd0ea18 3054{
c1030eda
DC
3055 struct xfs_mount *mp = pag->pag_mount;
3056 int error;
2bd0ea18 3057
c1030eda 3058 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
ff105f75 3059
3e6069a1 3060 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
c1030eda
DC
3061 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3062 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
9440d84d 3063 if (error)
2bd0ea18 3064 return error;
56b2de80 3065
c1030eda 3066 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
56b2de80
DC
3067 return 0;
3068}
3069
3070/*
87db57ba
DC
3071 * Read in the allocation group header (free/alloc section) and initialise the
3072 * perag structure if necessary. If the caller provides @agfbpp, then return the
3073 * locked buffer to the caller, otherwise free it.
56b2de80 3074 */
f9084bd9 3075int
56b2de80 3076xfs_alloc_read_agf(
f9084bd9
DC
3077 struct xfs_perag *pag,
3078 struct xfs_trans *tp,
3079 int flags,
87db57ba 3080 struct xfs_buf **agfbpp)
56b2de80 3081{
87db57ba 3082 struct xfs_buf *agfbp;
f9084bd9 3083 struct xfs_agf *agf;
56b2de80 3084 int error;
92c76be5 3085 int allocbt_blks;
56b2de80 3086
f9084bd9 3087 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
56b2de80 3088
475f184c
DW
3089 /* We don't support trylock when freeing. */
3090 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3091 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
c1030eda 3092 error = xfs_read_agf(pag, tp,
56b2de80 3093 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
87db57ba 3094 &agfbp);
56b2de80
DC
3095 if (error)
3096 return error;
56b2de80 3097
87db57ba 3098 agf = agfbp->b_addr;
03dc2ef2 3099 if (!xfs_perag_initialised_agf(pag)) {
6e3140c7 3100 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
cdded3d8 3101 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
6e3140c7
NS
3102 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3103 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
2bd0ea18 3104 pag->pagf_levels[XFS_BTNUM_BNOi] =
6e3140c7 3105 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
2bd0ea18 3106 pag->pagf_levels[XFS_BTNUM_CNTi] =
6e3140c7 3107 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
e37838e5
DW
3108 pag->pagf_levels[XFS_BTNUM_RMAPi] =
3109 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
88ce0792 3110 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
03dc2ef2
DC
3111 if (xfs_agfl_needs_reset(pag->pag_mount, agf))
3112 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
92c76be5
BF
3113
3114 /*
3115 * Update the in-core allocbt counter. Filter out the rmapbt
3116 * subset of the btreeblks counter because the rmapbt is managed
3117 * by perag reservation. Subtract one for the rmapbt root block
3118 * because the rmap counter includes it while the btreeblks
3119 * counter only tracks non-root blocks.
3120 */
3121 allocbt_blks = pag->pagf_btreeblks;
f9084bd9 3122 if (xfs_has_rmapbt(pag->pag_mount))
92c76be5
BF
3123 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3124 if (allocbt_blks > 0)
f9084bd9
DC
3125 atomic64_add(allocbt_blks,
3126 &pag->pag_mount->m_allocbt_blks);
03dc2ef2
DC
3127
3128 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
2bd0ea18
NS
3129 }
3130#ifdef DEBUG
f9084bd9 3131 else if (!xfs_is_shutdown(pag->pag_mount)) {
6e3140c7 3132 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
cdded3d8 3133 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
6e3140c7
NS
3134 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3135 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2bd0ea18 3136 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
6e3140c7 3137 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
2bd0ea18 3138 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
6e3140c7 3139 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2bd0ea18
NS
3140 }
3141#endif
87db57ba
DC
3142 if (agfbpp)
3143 *agfbpp = agfbp;
3144 else
3145 xfs_trans_brelse(tp, agfbp);
2bd0ea18
NS
3146 return 0;
3147}
3148
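/*
 * A worked example (not part of the source) of the allocbt counter update
 * above: the rmapbt subset of btreeblks is filtered out, minus one for the
 * rmapbt root, which btreeblks does not include. The counts are made up.
 */
#include <stdio.h>

int main(void)
{
	int pagf_btreeblks = 12;	/* non-root bno+cnt+rmap blocks */
	int agf_rmap_blocks = 5;	/* all rmapbt blocks, root included */
	int has_rmapbt = 1;
	int allocbt_blks = pagf_btreeblks;

	if (has_rmapbt)
		allocbt_blks -= agf_rmap_blocks - 1;	/* 12 - 4 = 8 */
	if (allocbt_blks > 0)
		/* prints: add 8 to m_allocbt_blks */
		printf("add %d to m_allocbt_blks\n", allocbt_blks);
	return 0;
}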
3149/*
3150 * Allocate an extent (variable-size).
3151 * Depending on the allocation type, we either look in a single allocation
3152 * group or loop over the allocation groups to find the result.
3153 */
3154int /* error */
3155xfs_alloc_vextent(
03aeb29d 3156 struct xfs_alloc_arg *args) /* allocation argument structure */
2bd0ea18 3157{
03aeb29d
BF
3158 xfs_agblock_t agsize; /* allocation group size */
3159 int error;
3160 int flags; /* XFS_ALLOC_FLAG_... locking flags */
3161 struct xfs_mount *mp; /* mount structure pointer */
3162 xfs_agnumber_t sagno; /* starting allocation group number */
3163 xfs_alloctype_t type; /* input allocation type */
3164 int bump_rotor = 0;
3165 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
755477b4 3166 xfs_agnumber_t minimum_agno = 0;
2bd0ea18
NS
3167
3168 mp = args->mp;
3169 type = args->otype = args->type;
3170 args->agbno = NULLAGBLOCK;
9cdecd7e
DC
3171 if (args->tp->t_highest_agno != NULLAGNUMBER)
3172 minimum_agno = args->tp->t_highest_agno;
2bd0ea18
NS
3173 /*
3174 * Just fix this up, for the case where the last a.g. is shorter
3175 * (or there's only one a.g.) and the caller couldn't easily figure
3176 * that out (xfs_bmap_alloc).
3177 */
3178 agsize = mp->m_sb.sb_agblocks;
3179 if (args->maxlen > agsize)
3180 args->maxlen = agsize;
3181 if (args->alignment == 0)
3182 args->alignment = 1;
3183 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3184 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3185 ASSERT(args->minlen <= args->maxlen);
3186 ASSERT(args->minlen <= agsize);
3187 ASSERT(args->mod < args->prod);
3188 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3189 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3190 args->minlen > args->maxlen || args->minlen > agsize ||
3191 args->mod >= args->prod) {
3192 args->fsbno = NULLFSBLOCK;
56b2de80 3193 trace_xfs_alloc_vextent_badargs(args);
2bd0ea18
NS
3194 return 0;
3195 }
9baa549b 3196
2bd0ea18
NS
3197 switch (type) {
3198 case XFS_ALLOCTYPE_THIS_AG:
3199 case XFS_ALLOCTYPE_NEAR_BNO:
3200 case XFS_ALLOCTYPE_THIS_BNO:
3201 /*
3202 * These three force us into a single a.g.
3203 */
3204 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
56b2de80 3205 args->pag = xfs_perag_get(mp, args->agno);
755477b4
DC
3206
3207 if (minimum_agno > args->agno) {
3208 trace_xfs_alloc_vextent_skip_deadlock(args);
3209 error = 0;
3210 break;
3211 }
3212
2bd0ea18 3213 error = xfs_alloc_fix_freelist(args, 0);
2bd0ea18 3214 if (error) {
56b2de80 3215 trace_xfs_alloc_vextent_nofix(args);
2bd0ea18
NS
3216 goto error0;
3217 }
3218 if (!args->agbp) {
56b2de80 3219 trace_xfs_alloc_vextent_noagbp(args);
2bd0ea18
NS
3220 break;
3221 }
3222 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
0e266570 3223 if ((error = xfs_alloc_ag_vextent(args)))
2bd0ea18 3224 goto error0;
2bd0ea18
NS
3225 break;
3226 case XFS_ALLOCTYPE_START_BNO:
3227 /*
3228 * Try near allocation first, then anywhere-in-ag after
3229 * the first a.g. fails.
3230 */
1fccd5c8 3231 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
0ee9753e 3232 xfs_is_inode32(mp)) {
46eca962
NS
3233 args->fsbno = XFS_AGB_TO_FSB(mp,
3234 ((mp->m_agfrotor / rotorstep) %
3235 mp->m_sb.sb_agcount), 0);
34317449
NS
3236 bump_rotor = 1;
3237 }
2bd0ea18
NS
3238 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3239 args->type = XFS_ALLOCTYPE_NEAR_BNO;
df9c7d8d 3240 fallthrough;
2bd0ea18
NS
3241 case XFS_ALLOCTYPE_FIRST_AG:
3242 /*
3243 * Rotate through the allocation groups looking for a winner.
755477b4
DC
3244	 * If we are blocking, we must obey minimum_agno constraints to
3245	 * avoid ABBA deadlocks on AGF locking.
2bd0ea18 3246 */
f3eda3a5 3247 if (type == XFS_ALLOCTYPE_FIRST_AG) {
2bd0ea18
NS
3248 /*
3249 * Start with allocation group given by bno.
3250 */
3251 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3252 args->type = XFS_ALLOCTYPE_THIS_AG;
755477b4 3253 sagno = minimum_agno;
2bd0ea18
NS
3254 flags = 0;
3255 } else {
2bd0ea18
NS
3256 /*
3257 * Start with the given allocation group.
3258 */
3259 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3260 flags = XFS_ALLOC_FLAG_TRYLOCK;
3261 }
755477b4 3262
2bd0ea18
NS
3263 /*
3264 * Loop over allocation groups twice; first time with
3265 * trylock set, second time without.
3266 */
3267 for (;;) {
56b2de80 3268 args->pag = xfs_perag_get(mp, args->agno);
9baa549b 3269 error = xfs_alloc_fix_freelist(args, flags);
9baa549b 3270 if (error) {
56b2de80 3271 trace_xfs_alloc_vextent_nofix(args);
2bd0ea18
NS
3272 goto error0;
3273 }
3274 /*
3275 * If we get a buffer back then the allocation will fly.
3276 */
3277 if (args->agbp) {
0e266570 3278 if ((error = xfs_alloc_ag_vextent(args)))
2bd0ea18 3279 goto error0;
2bd0ea18
NS
3280 break;
3281 }
56b2de80
DC
3282
3283 trace_xfs_alloc_vextent_loopfailed(args);
3284
2bd0ea18
NS
3285 /*
3286 * Didn't work, figure out the next iteration.
3287 */
3288 if (args->agno == sagno &&
3289 type == XFS_ALLOCTYPE_START_BNO)
3290 args->type = XFS_ALLOCTYPE_THIS_AG;
755477b4 3291
5e656dbb 3292 /*
755477b4
DC
3293 * If we are try-locking, we can't deadlock on AGF
3294 * locks, so we can wrap all the way back to the first
3295 * AG. Otherwise, wrap back to the start AG so we can't
3296 * deadlock, and let the end of scan handler decide what
3297 * to do next.
3298 */
5e656dbb 3299 if (++(args->agno) == mp->m_sb.sb_agcount) {
755477b4 3300 if (flags & XFS_ALLOC_FLAG_TRYLOCK)
5e656dbb 3301 args->agno = 0;
755477b4
DC
3302 else
3303 args->agno = sagno;
5e656dbb 3304 }
755477b4 3305
5000d01d 3306 /*
2bd0ea18
NS
3307 * Reached the starting a.g., must either be done
3308 * or switch to non-trylock mode.
3309 */
3310 if (args->agno == sagno) {
a3b4a951 3311 if (flags == 0) {
2bd0ea18 3312 args->agbno = NULLAGBLOCK;
56b2de80 3313 trace_xfs_alloc_vextent_allfailed(args);
2bd0ea18
NS
3314 break;
3315 }
a3b4a951 3316
755477b4
DC
3317 /*
3318 * Blocking pass next, so we must obey minimum
3319 * agno constraints to avoid ABBA AGF deadlocks.
3320 */
a3b4a951 3321 flags = 0;
755477b4
DC
3322 if (minimum_agno > sagno)
3323 sagno = minimum_agno;
3324
a3b4a951
CH
3325 if (type == XFS_ALLOCTYPE_START_BNO) {
3326 args->agbno = XFS_FSB_TO_AGBNO(mp,
3327 args->fsbno);
3328 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2bd0ea18
NS
3329 }
3330 }
56b2de80 3331 xfs_perag_put(args->pag);
2bd0ea18 3332 }
f3eda3a5 3333 if (bump_rotor) {
46eca962
NS
3334 if (args->agno == sagno)
3335 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3336 (mp->m_sb.sb_agcount * rotorstep);
3337 else
3338 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3339 (mp->m_sb.sb_agcount * rotorstep);
3340 }
2bd0ea18
NS
3341 break;
3342 default:
3343 ASSERT(0);
3344 /* NOTREACHED */
3345 }
	if (args->agbno == NULLAGBLOCK) {
		args->fsbno = NULLFSBLOCK;
	} else {
		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
#ifdef DEBUG
		ASSERT(args->len >= args->minlen);
		ASSERT(args->len <= args->maxlen);
		ASSERT(args->agbno % args->alignment == 0);
		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
			args->len);
#endif
	}

	/*
	 * We end up here with a locked AGF. If we failed, the caller is likely
	 * going to try to allocate again with different parameters, and that
	 * can widen the AGs that are searched for free space. If we have to do
	 * BMBT block allocation, we have to do a new allocation.
	 *
	 * Hence leaving this function with the AGF locked opens up potential
	 * ABBA AGF deadlocks because a future allocation attempt in this
	 * transaction may attempt to lock a lower number AGF.
	 *
	 * We can't release the AGF until the transaction is committed, so at
	 * this point we must update the "firstblock" tracker to point at this
	 * AG if the tracker is empty or points to a lower AG. This allows the
	 * next allocation attempt to be modified appropriately to avoid
	 * deadlocks.
	 */
	if (args->agbp &&
	    (args->tp->t_highest_agno == NULLAGNUMBER ||
	     args->pag->pag_agno > minimum_agno))
		args->tp->t_highest_agno = args->pag->pag_agno;
	xfs_perag_put(args->pag);
	return 0;
error0:
	xfs_perag_put(args->pag);
	return error;
}
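
/*
 * Illustrative sketch, not part of the original file: one way a caller could
 * turn the t_highest_agno tracker updated above into the minimum_agno
 * constraint for the next allocation in the same transaction. The helper
 * name xfs_example_min_agno() is hypothetical; only t_highest_agno and
 * NULLAGNUMBER come from the code above.
 */
static inline xfs_agnumber_t
xfs_example_min_agno(
	struct xfs_trans	*tp)
{
	/* No AGF locked in this transaction yet; any AG is fair game. */
	if (tp->t_highest_agno == NULLAGNUMBER)
		return 0;

	/* Only AGs at or above the highest locked AGF preserve lock order. */
	return tp->t_highest_agno;
}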

/* Ensure that the freelist is at full capacity. */
int
xfs_free_extent_fix_freelist(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	struct xfs_buf		**agbp)
{
	struct xfs_alloc_arg	args;
	int			error;

	memset(&args, 0, sizeof(struct xfs_alloc_arg));
	args.tp = tp;
	args.mp = tp->t_mountp;
	args.agno = pag->pag_agno;
	args.pag = pag;

	/*
	 * Validate that the AG number is legal - this enables us to detect
	 * and handle silent filesystem corruption rather than crashing.
	 */
	if (args.agno >= args.mp->m_sb.sb_agcount)
		return -EFSCORRUPTED;

	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
	if (error)
		return error;

	*agbp = args.agbp;
	return 0;
}

/*
 * Free an extent.
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
 */
int
__xfs_free_extent(
	struct xfs_trans		*tp,
	xfs_fsblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	bool				skip_discard)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_buf			*agbp;
	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
	struct xfs_agf			*agf;
	int				error;
	unsigned int			busy_flags = 0;
	struct xfs_perag		*pag;

	ASSERT(len != 0);
	ASSERT(type != XFS_AG_RESV_AGFL);

	if (XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_FREE_EXTENT))
		return -EIO;

	pag = xfs_perag_get(mp, agno);
	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
	if (error)
		goto err;
	agf = agbp->b_addr;

	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
		error = -EFSCORRUPTED;
		goto err_release;
	}

	/* validate the extent size is legal now we have the agf locked */
	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto err_release;
	}

	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
	if (error)
		goto err_release;

	if (skip_discard)
		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
	xfs_perag_put(pag);
	return 0;

err_release:
	xfs_trans_brelse(tp, agbp);
err:
	xfs_perag_put(pag);
	return error;
}
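
/*
 * Illustrative sketch, not part of the original file: a thin wrapper that
 * frees an extent with no particular reverse-mapping owner, no per-AG
 * reservation, and discard allowed. The wrapper name is hypothetical, and
 * the choice of XFS_RMAP_OINFO_ANY_OWNER is purely for illustration.
 */
static inline int
xfs_example_free_extent(
	struct xfs_trans	*tp,
	xfs_fsblock_t		bno,
	xfs_extlen_t		len)
{
	/* skip_discard == false: let online discard see this extent. */
	return __xfs_free_extent(tp, bno, len, &XFS_RMAP_OINFO_ANY_OWNER,
			XFS_AG_RESV_NONE, false);
}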

struct xfs_alloc_query_range_info {
	xfs_alloc_query_range_fn	fn;
	void				*priv;
};

/* Format btree record and pass to our callback. */
STATIC int
xfs_alloc_query_range_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_alloc_query_range_info	*query = priv;
	struct xfs_alloc_rec_incore		irec;

	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
	return query->fn(cur, &irec, query->priv);
}

/* Find all free space within a given range of blocks. */
int
xfs_alloc_query_range(
	struct xfs_btree_cur			*cur,
	const struct xfs_alloc_rec_incore	*low_rec,
	const struct xfs_alloc_rec_incore	*high_rec,
	xfs_alloc_query_range_fn		fn,
	void					*priv)
{
	union xfs_btree_irec			low_brec;
	union xfs_btree_irec			high_brec;
	struct xfs_alloc_query_range_info	query;

	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
	low_brec.a = *low_rec;
	high_brec.a = *high_rec;
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_range(cur, &low_brec, &high_brec,
			xfs_alloc_query_range_helper, &query);
}
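
/*
 * Illustrative sketch, not part of the original file: a minimal
 * xfs_alloc_query_range_fn that sums the lengths of the free extents the
 * query visits. The function name and the xfs_extlen_t accumulator passed
 * via @priv are hypothetical; the signature mirrors the query->fn call in
 * xfs_alloc_query_range_helper() above.
 */
static int
xfs_example_sum_free_blocks(
	struct xfs_btree_cur			*cur,
	const struct xfs_alloc_rec_incore	*rec,
	void					*priv)
{
	xfs_extlen_t				*total = priv;

	*total += rec->ar_blockcount;
	return 0;		/* continue the query */
}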

/* Find all free space records. */
int
xfs_alloc_query_all(
	struct xfs_btree_cur		*cur,
	xfs_alloc_query_range_fn	fn,
	void				*priv)
{
	struct xfs_alloc_query_range_info	query;

	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}

/* Is there a record covering a given extent? */
int
xfs_alloc_has_record(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	bool			*exists)
{
	union xfs_btree_irec	low;
	union xfs_btree_irec	high;

	memset(&low, 0, sizeof(low));
	low.a.ar_startblock = bno;
	memset(&high, 0xFF, sizeof(high));
	high.a.ar_startblock = bno + len - 1;

	return xfs_btree_has_record(cur, &low, &high, exists);
}
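
/*
 * Illustrative sketch, not part of the original file: a cross-reference
 * style check that flags a supposedly allocated extent as corrupt if the
 * by-block btree holds any free space record overlapping it. The function
 * name is hypothetical; the overlap semantics follow from the low/high
 * keys built by xfs_alloc_has_record() above.
 */
static int
xfs_example_check_allocated(
	struct xfs_btree_cur	*cur,	/* bnobt cursor */
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	bool			*corrupt)
{
	bool			overlaps;
	int			error;

	error = xfs_alloc_has_record(cur, bno, len, &overlaps);
	if (error)
		return error;

	/* An allocated extent must not overlap any free space record. */
	*corrupt = overlaps;
	return 0;
}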

/*
 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
 * error code or XFS_ITER_*.
 */
int
xfs_agfl_walk(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf,
	struct xfs_buf		*agflbp,
	xfs_agfl_walk_fn	walk_fn,
	void			*priv)
{
	__be32			*agfl_bno;
	unsigned int		i;
	int			error;

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	i = be32_to_cpu(agf->agf_flfirst);

	/* Nothing to walk in an empty AGFL. */
	if (agf->agf_flcount == cpu_to_be32(0))
		return 0;

	/* Otherwise, walk from first to last, wrapping as needed. */
	for (;;) {
		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
		if (error)
			return error;
		if (i == be32_to_cpu(agf->agf_fllast))
			break;
		if (++i == xfs_agfl_size(mp))
			i = 0;
	}

	return 0;
}
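
/*
 * Illustrative sketch, not part of the original file: a minimal
 * xfs_agfl_walk_fn that counts the blocks on the AGFL. The function name
 * and the unsigned int counter passed via @priv are hypothetical; the
 * argument types mirror the walk_fn() call above.
 */
static int
xfs_example_count_agfl_blocks(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	unsigned int		*count = priv;

	(*count)++;
	return 0;		/* keep walking */
}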

int __init
xfs_extfree_intent_init_cache(void)
{
	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
			sizeof(struct xfs_extent_free_item),
			0, 0, NULL);

	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_extfree_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_extfree_item_cache);
	xfs_extfree_item_cache = NULL;
}