]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blame - libxfs/xfs_alloc.c
xfs: refactor and reuse best extent scanning logic
[thirdparty/xfsprogs-dev.git] / libxfs / xfs_alloc.c
CommitLineData
37b3b4d6 1// SPDX-License-Identifier: GPL-2.0
2bd0ea18 2/*
5e656dbb 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
da23017d 4 * All Rights Reserved.
2bd0ea18 5 */
9c799827 6#include "libxfs_priv.h"
b626fb59
DC
7#include "xfs_fs.h"
8#include "xfs_format.h"
9#include "xfs_log_format.h"
10#include "xfs_shared.h"
11#include "xfs_trans_resv.h"
12#include "xfs_bit.h"
13#include "xfs_sb.h"
14#include "xfs_mount.h"
f944d3d0 15#include "xfs_defer.h"
b626fb59 16#include "xfs_btree.h"
631ac87a 17#include "xfs_rmap.h"
b626fb59
DC
18#include "xfs_alloc_btree.h"
19#include "xfs_alloc.h"
b626fb59
DC
20#include "xfs_trace.h"
21#include "xfs_trans.h"
cf8ce220 22#include "xfs_ag_resv.h"
d5c1b462
BF
23#include "xfs_bmap.h"
24
25extern kmem_zone_t *xfs_bmap_free_item_zone;
2bd0ea18 26
ff105f75
DC
27struct workqueue_struct *xfs_alloc_wq;
28
2bd0ea18 29#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
5e656dbb
BN
30
31#define XFSA_FIXUP_BNO_OK 1
32#define XFSA_FIXUP_CNT_OK 2
33
5e656dbb
BN
34STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
35STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
36STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
2bd0ea18 37
b8165508
DC
38/*
39 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
40 * the beginning of the block for a proper header with the location information
41 * and CRC.
42 */
43unsigned int
44xfs_agfl_size(
45 struct xfs_mount *mp)
46{
47 unsigned int size = mp->m_sb.sb_sectsize;
48
49 if (xfs_sb_version_hascrc(&mp->m_sb))
50 size -= sizeof(struct xfs_agfl);
51
52 return size / sizeof(xfs_agblock_t);
53}
54
2a96beb9
DW
55unsigned int
56xfs_refc_block(
57 struct xfs_mount *mp)
58{
59 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
60 return XFS_RMAP_BLOCK(mp) + 1;
61 if (xfs_sb_version_hasfinobt(&mp->m_sb))
62 return XFS_FIBT_BLOCK(mp) + 1;
63 return XFS_IBT_BLOCK(mp) + 1;
64}
65
ef5340cd
DW
66xfs_extlen_t
67xfs_prealloc_blocks(
68 struct xfs_mount *mp)
69{
2a96beb9
DW
70 if (xfs_sb_version_hasreflink(&mp->m_sb))
71 return xfs_refc_block(mp) + 1;
ef5340cd
DW
72 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
73 return XFS_RMAP_BLOCK(mp) + 1;
74 if (xfs_sb_version_hasfinobt(&mp->m_sb))
75 return XFS_FIBT_BLOCK(mp) + 1;
76 return XFS_IBT_BLOCK(mp) + 1;
77}
78
b8a8d6e5
DW
79/*
80 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
81 * AGF buffer (PV 947395), we place constraints on the relationship among
82 * actual allocations for data blocks, freelist blocks, and potential file data
83 * bmap btree blocks. However, these restrictions may result in no actual space
84 * allocated for a delayed extent, for example, a data block in a certain AG is
85 * allocated but there is no additional block for the additional bmap btree
86 * block due to a split of the bmap btree of the file. The result of this may
87 * lead to an infinite loop when the file gets flushed to disk and all delayed
88 * extents need to be actually allocated. To get around this, we explicitly set
89 * aside a few blocks which will not be reserved in delayed allocation.
90 *
cf8ce220
DW
91 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
92 * potential split of the file's bmap btree.
b8a8d6e5
DW
93 */
94unsigned int
95xfs_alloc_set_aside(
96 struct xfs_mount *mp)
97{
8eeb15ea 98 return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
b8a8d6e5
DW
99}
100
101/*
102 * When deciding how much space to allocate out of an AG, we limit the
103 * allocation maximum size to the size the AG. However, we cannot use all the
104 * blocks in the AG - some are permanently used by metadata. These
105 * blocks are generally:
106 * - the AG superblock, AGF, AGI and AGFL
107 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
108 * the AGI free inode and rmap btree root blocks.
109 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
110 * - the rmapbt root block
111 *
112 * The AG headers are sector sized, so the amount of space they take up is
113 * dependent on filesystem geometry. The others are all single blocks.
114 */
115unsigned int
116xfs_alloc_ag_max_usable(
117 struct xfs_mount *mp)
118{
119 unsigned int blocks;
120
121 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
122 blocks += XFS_ALLOC_AGFL_RESERVE;
123 blocks += 3; /* AGF, AGI btree root blocks */
124 if (xfs_sb_version_hasfinobt(&mp->m_sb))
125 blocks++; /* finobt root block */
126 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
127 blocks++; /* rmap root block */
868c70e3
DW
128 if (xfs_sb_version_hasreflink(&mp->m_sb))
129 blocks++; /* refcount root block */
b8a8d6e5
DW
130
131 return mp->m_sb.sb_agblocks - blocks;
132}
133
b194c7d8
BN
134/*
135 * Lookup the record equal to [bno, len] in the btree given by cur.
136 */
137STATIC int /* error */
138xfs_alloc_lookup_eq(
139 struct xfs_btree_cur *cur, /* btree cursor */
140 xfs_agblock_t bno, /* starting block of extent */
141 xfs_extlen_t len, /* length of extent */
142 int *stat) /* success/failure */
143{
7777d8c6
BF
144 int error;
145
b194c7d8
BN
146 cur->bc_rec.a.ar_startblock = bno;
147 cur->bc_rec.a.ar_blockcount = len;
7777d8c6
BF
148 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
149 cur->bc_private.a.priv.abt.active = (*stat == 1);
150 return error;
b194c7d8
BN
151}
152
153/*
154 * Lookup the first record greater than or equal to [bno, len]
155 * in the btree given by cur.
156 */
a2ceac1f 157int /* error */
b194c7d8
BN
158xfs_alloc_lookup_ge(
159 struct xfs_btree_cur *cur, /* btree cursor */
160 xfs_agblock_t bno, /* starting block of extent */
161 xfs_extlen_t len, /* length of extent */
162 int *stat) /* success/failure */
163{
7777d8c6
BF
164 int error;
165
b194c7d8
BN
166 cur->bc_rec.a.ar_startblock = bno;
167 cur->bc_rec.a.ar_blockcount = len;
7777d8c6
BF
168 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
169 cur->bc_private.a.priv.abt.active = (*stat == 1);
170 return error;
b194c7d8
BN
171}
172
173/*
174 * Lookup the first record less than or equal to [bno, len]
175 * in the btree given by cur.
176 */
1fe41a73 177int /* error */
b194c7d8
BN
178xfs_alloc_lookup_le(
179 struct xfs_btree_cur *cur, /* btree cursor */
180 xfs_agblock_t bno, /* starting block of extent */
181 xfs_extlen_t len, /* length of extent */
182 int *stat) /* success/failure */
183{
7777d8c6 184 int error;
b194c7d8
BN
185 cur->bc_rec.a.ar_startblock = bno;
186 cur->bc_rec.a.ar_blockcount = len;
7777d8c6
BF
187 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
188 cur->bc_private.a.priv.abt.active = (*stat == 1);
189 return error;
190}
191
192static inline bool
193xfs_alloc_cur_active(
194 struct xfs_btree_cur *cur)
195{
196 return cur && cur->bc_private.a.priv.abt.active;
b194c7d8
BN
197}
198
199/*
200 * Update the record referred to by cur to the value given
201 * by [bno, len].
202 * This either works (return 0) or gets an EFSCORRUPTED error.
203 */
204STATIC int /* error */
205xfs_alloc_update(
206 struct xfs_btree_cur *cur, /* btree cursor */
207 xfs_agblock_t bno, /* starting block of extent */
208 xfs_extlen_t len) /* length of extent */
209{
210 union xfs_btree_rec rec;
211
212 rec.alloc.ar_startblock = cpu_to_be32(bno);
213 rec.alloc.ar_blockcount = cpu_to_be32(len);
214 return xfs_btree_update(cur, &rec);
215}
216
217/*
218 * Get the data from the pointed-to record.
219 */
a2ceac1f 220int /* error */
b194c7d8
BN
221xfs_alloc_get_rec(
222 struct xfs_btree_cur *cur, /* btree cursor */
223 xfs_agblock_t *bno, /* output: starting block of extent */
224 xfs_extlen_t *len, /* output: length of extent */
225 int *stat) /* output: success/failure */
226{
ec291989
DC
227 struct xfs_mount *mp = cur->bc_mp;
228 xfs_agnumber_t agno = cur->bc_private.a.agno;
b194c7d8
BN
229 union xfs_btree_rec *rec;
230 int error;
231
232 error = xfs_btree_get_rec(cur, &rec, stat);
3741f971
DW
233 if (error || !(*stat))
234 return error;
3741f971
DW
235
236 *bno = be32_to_cpu(rec->alloc.ar_startblock);
237 *len = be32_to_cpu(rec->alloc.ar_blockcount);
238
c25e4305
CM
239 if (*len == 0)
240 goto out_bad_rec;
241
ec291989
DC
242 /* check for valid extent range, including overflow */
243 if (!xfs_verify_agbno(mp, agno, *bno))
244 goto out_bad_rec;
245 if (*bno > *bno + *len)
246 goto out_bad_rec;
247 if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
248 goto out_bad_rec;
249
250 return 0;
251
252out_bad_rec:
253 xfs_warn(mp,
254 "%s Freespace BTree record corruption in AG %d detected!",
255 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
256 xfs_warn(mp,
257 "start block 0x%x block count 0x%x", *bno, *len);
258 return -EFSCORRUPTED;
b194c7d8
BN
259}
260
2bd0ea18
NS
261/*
262 * Compute aligned version of the found extent.
263 * Takes alignment and min length into account.
264 */
cd80de04 265STATIC bool
2bd0ea18 266xfs_alloc_compute_aligned(
a2ceac1f 267 xfs_alloc_arg_t *args, /* allocation argument structure */
2bd0ea18
NS
268 xfs_agblock_t foundbno, /* starting block in found extent */
269 xfs_extlen_t foundlen, /* length in found extent */
2bd0ea18 270 xfs_agblock_t *resbno, /* result block number */
cd80de04
CH
271 xfs_extlen_t *reslen, /* result length */
272 unsigned *busy_gen)
2bd0ea18 273{
cd80de04
CH
274 xfs_agblock_t bno = foundbno;
275 xfs_extlen_t len = foundlen;
ff3263dd 276 xfs_extlen_t diff;
cd80de04 277 bool busy;
2bd0ea18 278
a2ceac1f 279 /* Trim busy sections out of found extent */
cd80de04 280 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
a2ceac1f 281
ff3263dd
BF
282 /*
283 * If we have a largish extent that happens to start before min_agbno,
284 * see if we can shift it into range...
285 */
286 if (bno < args->min_agbno && bno + len > args->min_agbno) {
287 diff = args->min_agbno - bno;
288 if (len > diff) {
289 bno += diff;
290 len -= diff;
291 }
292 }
293
a2ceac1f
DC
294 if (args->alignment > 1 && len >= args->minlen) {
295 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
ff3263dd
BF
296
297 diff = aligned_bno - bno;
a2ceac1f
DC
298
299 *resbno = aligned_bno;
300 *reslen = diff >= len ? 0 : len - diff;
2bd0ea18 301 } else {
a2ceac1f
DC
302 *resbno = bno;
303 *reslen = len;
2bd0ea18 304 }
cd80de04
CH
305
306 return busy;
2bd0ea18
NS
307}
308
309/*
310 * Compute best start block and diff for "near" allocations.
311 * freelen >= wantlen already checked by caller.
312 */
313STATIC xfs_extlen_t /* difference value (absolute) */
314xfs_alloc_compute_diff(
315 xfs_agblock_t wantbno, /* target starting block */
316 xfs_extlen_t wantlen, /* target length */
317 xfs_extlen_t alignment, /* target alignment */
1fccd5c8 318 int datatype, /* are we allocating data? */
2bd0ea18
NS
319 xfs_agblock_t freebno, /* freespace's starting block */
320 xfs_extlen_t freelen, /* freespace's length */
321 xfs_agblock_t *newbnop) /* result: best start block from free */
322{
323 xfs_agblock_t freeend; /* end of freespace extent */
324 xfs_agblock_t newbno1; /* return block number */
325 xfs_agblock_t newbno2; /* other new block number */
0e266570
NS
326 xfs_extlen_t newlen1=0; /* length with newbno1 */
327 xfs_extlen_t newlen2=0; /* length with newbno2 */
2bd0ea18 328 xfs_agblock_t wantend; /* end of target extent */
1fccd5c8 329 bool userdata = xfs_alloc_is_userdata(datatype);
2bd0ea18
NS
330
331 ASSERT(freelen >= wantlen);
332 freeend = freebno + freelen;
333 wantend = wantbno + wantlen;
84a62eea
DC
334 /*
335 * We want to allocate from the start of a free extent if it is past
336 * the desired block or if we are allocating user data and the free
337 * extent is before desired block. The second case is there to allow
338 * for contiguous allocation from the remaining free space if the file
339 * grows in the short term.
340 */
341 if (freebno >= wantbno || (userdata && freeend < wantend)) {
2bd0ea18
NS
342 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
343 newbno1 = NULLAGBLOCK;
344 } else if (freeend >= wantend && alignment > 1) {
345 newbno1 = roundup(wantbno, alignment);
346 newbno2 = newbno1 - alignment;
347 if (newbno1 >= freeend)
348 newbno1 = NULLAGBLOCK;
349 else
350 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
351 if (newbno2 < freebno)
352 newbno2 = NULLAGBLOCK;
353 else
354 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
355 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
356 if (newlen1 < newlen2 ||
357 (newlen1 == newlen2 &&
358 XFS_ABSDIFF(newbno1, wantbno) >
359 XFS_ABSDIFF(newbno2, wantbno)))
360 newbno1 = newbno2;
361 } else if (newbno2 != NULLAGBLOCK)
362 newbno1 = newbno2;
363 } else if (freeend >= wantend) {
364 newbno1 = wantbno;
365 } else if (alignment > 1) {
366 newbno1 = roundup(freeend - wantlen, alignment);
367 if (newbno1 > freeend - wantlen &&
368 newbno1 - alignment >= freebno)
369 newbno1 -= alignment;
370 else if (newbno1 >= freeend)
371 newbno1 = NULLAGBLOCK;
372 } else
373 newbno1 = freeend - wantlen;
374 *newbnop = newbno1;
375 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
376}
377
378/*
379 * Fix up the length, based on mod and prod.
380 * len should be k * prod + mod for some k.
381 * If len is too small it is returned unchanged.
382 * If len hits maxlen it is left alone.
383 */
384STATIC void
385xfs_alloc_fix_len(
dfc130f3 386 xfs_alloc_arg_t *args) /* allocation argument structure */
2bd0ea18
NS
387{
388 xfs_extlen_t k;
389 xfs_extlen_t rlen;
390
391 ASSERT(args->mod < args->prod);
392 rlen = args->len;
393 ASSERT(rlen >= args->minlen);
394 ASSERT(rlen <= args->maxlen);
395 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
396 (args->mod == 0 && rlen < args->prod))
397 return;
398 k = rlen % args->prod;
399 if (k == args->mod)
400 return;
ff105f75
DC
401 if (k > args->mod)
402 rlen = rlen - (k - args->mod);
403 else
404 rlen = rlen - args->prod + (args->mod - k);
19ebedcf 405 /* casts to (int) catch length underflows */
ff105f75
DC
406 if ((int)rlen < (int)args->minlen)
407 return;
408 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
409 ASSERT(rlen % args->prod == args->mod);
2c003dc2
CH
410 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
411 rlen + args->minleft);
2bd0ea18
NS
412 args->len = rlen;
413}
414
2bd0ea18
NS
415/*
416 * Update the two btrees, logically removing from freespace the extent
417 * starting at rbno, rlen blocks. The extent is contained within the
418 * actual (current) free extent fbno for flen blocks.
419 * Flags are passed in indicating whether the cursors are set to the
420 * relevant records.
421 */
422STATIC int /* error code */
423xfs_alloc_fixup_trees(
dfc130f3
RC
424 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
425 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
2bd0ea18
NS
426 xfs_agblock_t fbno, /* starting block of free extent */
427 xfs_extlen_t flen, /* length of free extent */
428 xfs_agblock_t rbno, /* starting block of returned extent */
429 xfs_extlen_t rlen, /* length of returned extent */
430 int flags) /* flags, XFSA_FIXUP_... */
431{
432 int error; /* error code */
433 int i; /* operation results */
434 xfs_agblock_t nfbno1; /* first new free startblock */
435 xfs_agblock_t nfbno2; /* second new free startblock */
0e266570
NS
436 xfs_extlen_t nflen1=0; /* first new free length */
437 xfs_extlen_t nflen2=0; /* second new free length */
19ebedcf
DC
438 struct xfs_mount *mp;
439
440 mp = cnt_cur->bc_mp;
2bd0ea18
NS
441
442 /*
443 * Look up the record in the by-size tree if necessary.
444 */
445 if (flags & XFSA_FIXUP_CNT_OK) {
446#ifdef DEBUG
0e266570 447 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
2bd0ea18 448 return error;
19ebedcf 449 XFS_WANT_CORRUPTED_RETURN(mp,
2bd0ea18
NS
450 i == 1 && nfbno1 == fbno && nflen1 == flen);
451#endif
452 } else {
0e266570 453 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
2bd0ea18 454 return error;
19ebedcf 455 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18
NS
456 }
457 /*
458 * Look up the record in the by-block tree if necessary.
459 */
460 if (flags & XFSA_FIXUP_BNO_OK) {
461#ifdef DEBUG
0e266570 462 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
2bd0ea18 463 return error;
19ebedcf 464 XFS_WANT_CORRUPTED_RETURN(mp,
2bd0ea18
NS
465 i == 1 && nfbno1 == fbno && nflen1 == flen);
466#endif
467 } else {
0e266570 468 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
2bd0ea18 469 return error;
19ebedcf 470 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18 471 }
b3563c19 472
2bd0ea18 473#ifdef DEBUG
b3563c19
BN
474 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
475 struct xfs_btree_block *bnoblock;
476 struct xfs_btree_block *cntblock;
477
478 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
479 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
2bd0ea18 480
19ebedcf 481 XFS_WANT_CORRUPTED_RETURN(mp,
b3563c19 482 bnoblock->bb_numrecs == cntblock->bb_numrecs);
2bd0ea18
NS
483 }
484#endif
b3563c19 485
2bd0ea18
NS
486 /*
487 * Deal with all four cases: the allocated record is contained
488 * within the freespace record, so we can have new freespace
489 * at either (or both) end, or no freespace remaining.
490 */
491 if (rbno == fbno && rlen == flen)
492 nfbno1 = nfbno2 = NULLAGBLOCK;
493 else if (rbno == fbno) {
494 nfbno1 = rbno + rlen;
495 nflen1 = flen - rlen;
496 nfbno2 = NULLAGBLOCK;
497 } else if (rbno + rlen == fbno + flen) {
498 nfbno1 = fbno;
499 nflen1 = flen - rlen;
500 nfbno2 = NULLAGBLOCK;
501 } else {
502 nfbno1 = fbno;
503 nflen1 = rbno - fbno;
504 nfbno2 = rbno + rlen;
505 nflen2 = (fbno + flen) - nfbno2;
506 }
507 /*
508 * Delete the entry from the by-size btree.
509 */
b194c7d8 510 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 511 return error;
19ebedcf 512 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18
NS
513 /*
514 * Add new by-size btree entry(s).
515 */
516 if (nfbno1 != NULLAGBLOCK) {
0e266570 517 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
2bd0ea18 518 return error;
19ebedcf 519 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
b194c7d8 520 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 521 return error;
19ebedcf 522 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18
NS
523 }
524 if (nfbno2 != NULLAGBLOCK) {
0e266570 525 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
2bd0ea18 526 return error;
19ebedcf 527 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
b194c7d8 528 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 529 return error;
19ebedcf 530 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18
NS
531 }
532 /*
533 * Fix up the by-block btree entry(s).
534 */
535 if (nfbno1 == NULLAGBLOCK) {
536 /*
537 * No remaining freespace, just delete the by-block tree entry.
538 */
b194c7d8 539 if ((error = xfs_btree_delete(bno_cur, &i)))
2bd0ea18 540 return error;
19ebedcf 541 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18
NS
542 } else {
543 /*
544 * Update the by-block entry to start later|be shorter.
545 */
0e266570 546 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
2bd0ea18
NS
547 return error;
548 }
549 if (nfbno2 != NULLAGBLOCK) {
550 /*
551 * 2 resulting free entries, need to add one.
552 */
0e266570 553 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
2bd0ea18 554 return error;
19ebedcf 555 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
b194c7d8 556 if ((error = xfs_btree_insert(bno_cur, &i)))
2bd0ea18 557 return error;
19ebedcf 558 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
2bd0ea18
NS
559 }
560 return 0;
561}
562
bc01119d 563static xfs_failaddr_t
a2ceac1f
DC
564xfs_agfl_verify(
565 struct xfs_buf *bp)
566{
7861ef77 567 struct xfs_mount *mp = bp->b_mount;
a2ceac1f 568 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
a2ceac1f
DC
569 int i;
570
95d9582b
DW
571 /*
572 * There is no verification of non-crc AGFLs because mkfs does not
573 * initialise the AGFL to zero or NULL. Hence the only valid part of the
574 * AGFL is what the AGF says is active. We can't get to the AGF, so we
575 * can't verify just those entries are valid.
576 */
577 if (!xfs_sb_version_hascrc(&mp->m_sb))
578 return NULL;
579
68dbe77f 580 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
bc01119d 581 return __this_address;
68dbe77f 582 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
bc01119d 583 return __this_address;
dd5b876e
DC
584 /*
585 * during growfs operations, the perag is not fully initialised,
586 * so we can't use it for any useful checking. growfs ensures we can't
587 * use it by using uncached buffers that don't have the perag attached
588 * so we can detect and avoid this problem.
589 */
590 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
bc01119d 591 return __this_address;
dd5b876e 592
b8165508 593 for (i = 0; i < xfs_agfl_size(mp); i++) {
dd5b876e 594 if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK &&
a2ceac1f 595 be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
bc01119d 596 return __this_address;
a2ceac1f 597 }
a65d8d29 598
bc01119d
DW
599 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
600 return __this_address;
601 return NULL;
dd5b876e
DC
602}
603
604static void
605xfs_agfl_read_verify(
606 struct xfs_buf *bp)
607{
7861ef77 608 struct xfs_mount *mp = bp->b_mount;
1e697959 609 xfs_failaddr_t fa;
dd5b876e
DC
610
611 /*
612 * There is no verification of non-crc AGFLs because mkfs does not
613 * initialise the AGFL to zero or NULL. Hence the only valid part of the
614 * AGFL is what the AGF says is active. We can't get to the AGF, so we
615 * can't verify just those entries are valid.
616 */
617 if (!xfs_sb_version_hascrc(&mp->m_sb))
618 return;
619
45922933 620 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
1e697959
DW
621 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
622 else {
623 fa = xfs_agfl_verify(bp);
624 if (fa)
625 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
626 }
a2ceac1f
DC
627}
628
629static void
630xfs_agfl_write_verify(
631 struct xfs_buf *bp)
632{
7861ef77 633 struct xfs_mount *mp = bp->b_mount;
37d086ca 634 struct xfs_buf_log_item *bip = bp->b_log_item;
1e697959 635 xfs_failaddr_t fa;
a2ceac1f 636
dd5b876e
DC
637 /* no verification of non-crc AGFLs */
638 if (!xfs_sb_version_hascrc(&mp->m_sb))
639 return;
640
1e697959
DW
641 fa = xfs_agfl_verify(bp);
642 if (fa) {
643 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
dd5b876e
DC
644 return;
645 }
646
647 if (bip)
648 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
649
43b5aeed 650 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
a2ceac1f
DC
651}
652
653const struct xfs_buf_ops xfs_agfl_buf_ops = {
a3fac935 654 .name = "xfs_agfl",
68dbe77f 655 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
a2ceac1f
DC
656 .verify_read = xfs_agfl_read_verify,
657 .verify_write = xfs_agfl_write_verify,
95d9582b 658 .verify_struct = xfs_agfl_verify,
a2ceac1f
DC
659};
660
2bd0ea18
NS
661/*
662 * Read in the allocation group free block array.
663 */
50bb67d6 664int /* error */
2bd0ea18
NS
665xfs_alloc_read_agfl(
666 xfs_mount_t *mp, /* mount point structure */
667 xfs_trans_t *tp, /* transaction pointer */
668 xfs_agnumber_t agno, /* allocation group number */
669 xfs_buf_t **bpp) /* buffer for the ag free block array */
670{
671 xfs_buf_t *bp; /* return value */
2bd0ea18
NS
672 int error;
673
674 ASSERT(agno != NULLAGNUMBER);
9440d84d
NS
675 error = xfs_trans_read_buf(
676 mp, tp, mp->m_ddev_targp,
677 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
a2ceac1f 678 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
9440d84d 679 if (error)
2bd0ea18 680 return error;
a2ceac1f 681 xfs_buf_set_ref(bp, XFS_AGFL_REF);
2bd0ea18
NS
682 *bpp = bp;
683 return 0;
684}
685
a2ceac1f
DC
686STATIC int
687xfs_alloc_update_counters(
688 struct xfs_trans *tp,
689 struct xfs_perag *pag,
690 struct xfs_buf *agbp,
691 long len)
692{
693 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
694
695 pag->pagf_freeblks += len;
696 be32_add_cpu(&agf->agf_freeblks, len);
697
698 xfs_trans_agblocks_delta(tp, len);
699 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
700 be32_to_cpu(agf->agf_length)))
12b53197 701 return -EFSCORRUPTED;
a2ceac1f
DC
702
703 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
704 return 0;
705}
706
2bd0ea18 707/*
2950989c 708 * Block allocation algorithm and data structures.
2bd0ea18 709 */
2950989c
BF
710struct xfs_alloc_cur {
711 struct xfs_btree_cur *cnt; /* btree cursors */
712 struct xfs_btree_cur *bnolt;
713 struct xfs_btree_cur *bnogt;
19fe42e3
BF
714 xfs_agblock_t rec_bno;/* extent startblock */
715 xfs_extlen_t rec_len;/* extent length */
716 xfs_agblock_t bno; /* alloc bno */
717 xfs_extlen_t len; /* alloc len */
718 xfs_extlen_t diff; /* diff from search bno */
e055d59e
BF
719 unsigned int busy_gen;/* busy state */
720 bool busy;
2950989c
BF
721};
722
723/*
724 * Set up cursors, etc. in the extent allocation cursor. This function can be
725 * called multiple times to reset an initialized structure without having to
726 * reallocate cursors.
727 */
728static int
729xfs_alloc_cur_setup(
730 struct xfs_alloc_arg *args,
731 struct xfs_alloc_cur *acur)
732{
733 int error;
734 int i;
735
736 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
737
19fe42e3
BF
738 acur->rec_bno = 0;
739 acur->rec_len = 0;
740 acur->bno = 0;
741 acur->len = 0;
04ea0ac1 742 acur->diff = -1;
e055d59e
BF
743 acur->busy = false;
744 acur->busy_gen = 0;
745
2950989c
BF
746 /*
747 * Perform an initial cntbt lookup to check for availability of maxlen
748 * extents. If this fails, we'll return -ENOSPC to signal the caller to
749 * attempt a small allocation.
750 */
751 if (!acur->cnt)
752 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
753 args->agbp, args->agno, XFS_BTNUM_CNT);
754 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
755 if (error)
756 return error;
757
758 /*
759 * Allocate the bnobt left and right search cursors.
760 */
761 if (!acur->bnolt)
762 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
763 args->agbp, args->agno, XFS_BTNUM_BNO);
764 if (!acur->bnogt)
765 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
766 args->agbp, args->agno, XFS_BTNUM_BNO);
767 return i == 1 ? 0 : -ENOSPC;
768}
769
770static void
771xfs_alloc_cur_close(
772 struct xfs_alloc_cur *acur,
773 bool error)
774{
775 int cur_error = XFS_BTREE_NOERROR;
776
777 if (error)
778 cur_error = XFS_BTREE_ERROR;
779
780 if (acur->cnt)
781 xfs_btree_del_cursor(acur->cnt, cur_error);
782 if (acur->bnolt)
783 xfs_btree_del_cursor(acur->bnolt, cur_error);
784 if (acur->bnogt)
785 xfs_btree_del_cursor(acur->bnogt, cur_error);
786 acur->cnt = acur->bnolt = acur->bnogt = NULL;
787}
2bd0ea18 788
04ea0ac1
BF
789/*
790 * Check an extent for allocation and track the best available candidate in the
791 * allocation structure. The cursor is deactivated if it has entered an out of
792 * range state based on allocation arguments. Optionally return the extent
793 * extent geometry and allocation status if requested by the caller.
794 */
795static int
796xfs_alloc_cur_check(
797 struct xfs_alloc_arg *args,
798 struct xfs_alloc_cur *acur,
799 struct xfs_btree_cur *cur,
800 int *new)
801{
802 int error, i;
803 xfs_agblock_t bno, bnoa, bnew;
804 xfs_extlen_t len, lena, diff = -1;
805 bool busy;
806 unsigned busy_gen = 0;
807 bool deactivate = false;
3ca39168 808 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
04ea0ac1
BF
809
810 *new = 0;
811
812 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
813 if (error)
814 return error;
815 XFS_WANT_CORRUPTED_RETURN(args->mp, i == 1);
816
817 /*
818 * Check minlen and deactivate a cntbt cursor if out of acceptable size
819 * range (i.e., walking backwards looking for a minlen extent).
820 */
821 if (len < args->minlen) {
3ca39168 822 deactivate = !isbnobt;
04ea0ac1
BF
823 goto out;
824 }
825
826 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
827 &busy_gen);
828 acur->busy |= busy;
829 if (busy)
830 acur->busy_gen = busy_gen;
831 /* deactivate a bnobt cursor outside of locality range */
3ca39168
BF
832 if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
833 deactivate = isbnobt;
04ea0ac1 834 goto out;
3ca39168 835 }
04ea0ac1
BF
836 if (lena < args->minlen)
837 goto out;
838
839 args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
840 xfs_alloc_fix_len(args);
841 ASSERT(args->len >= args->minlen);
842 if (args->len < acur->len)
843 goto out;
844
845 /*
846 * We have an aligned record that satisfies minlen and beats or matches
847 * the candidate extent size. Compare locality for near allocation mode.
848 */
849 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
850 diff = xfs_alloc_compute_diff(args->agbno, args->len,
851 args->alignment, args->datatype,
852 bnoa, lena, &bnew);
853 if (bnew == NULLAGBLOCK)
854 goto out;
3ca39168
BF
855
856 /*
857 * Deactivate a bnobt cursor with worse locality than the current best.
858 */
859 if (diff > acur->diff) {
860 deactivate = isbnobt;
04ea0ac1 861 goto out;
3ca39168 862 }
04ea0ac1
BF
863
864 ASSERT(args->len > acur->len ||
865 (args->len == acur->len && diff <= acur->diff));
866 acur->rec_bno = bno;
867 acur->rec_len = len;
868 acur->bno = bnew;
869 acur->len = args->len;
870 acur->diff = diff;
871 *new = 1;
872
4f2eee5a
BF
873 /*
874 * We're done if we found a perfect allocation. This only deactivates
875 * the current cursor, but this is just an optimization to terminate a
876 * cntbt search that otherwise runs to the edge of the tree.
877 */
878 if (acur->diff == 0 && acur->len == args->maxlen)
879 deactivate = true;
04ea0ac1
BF
880out:
881 if (deactivate)
882 cur->bc_private.a.priv.abt.active = false;
883 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
884 *new);
885 return 0;
886}
887
ba02381c
BF
888/*
889 * Deal with the case where only small freespaces remain. Either return the
890 * contents of the last freespace record, or allocate space from the freelist if
891 * there is nothing in the tree.
892 */
893STATIC int /* error */
894xfs_alloc_ag_vextent_small(
895 struct xfs_alloc_arg *args, /* allocation argument structure */
896 struct xfs_btree_cur *ccur, /* optional by-size cursor */
897 xfs_agblock_t *fbnop, /* result block number */
898 xfs_extlen_t *flenp, /* result length */
899 int *stat) /* status: 0-freelist, 1-normal/none */
900{
901 int error = 0;
902 xfs_agblock_t fbno = NULLAGBLOCK;
903 xfs_extlen_t flen = 0;
9e1862f0 904 int i = 0;
ba02381c 905
9e1862f0
BF
906 /*
907 * If a cntbt cursor is provided, try to allocate the largest record in
908 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
909 * allocation. Make sure to respect minleft even when pulling from the
910 * freelist.
911 */
912 if (ccur)
913 error = xfs_btree_decrement(ccur, 0, &i);
ba02381c
BF
914 if (error)
915 goto error;
916 if (i) {
917 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
918 if (error)
919 goto error;
920 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error);
921 goto out;
922 }
923
924 if (args->minlen != 1 || args->alignment != 1 ||
925 args->resv == XFS_AG_RESV_AGFL ||
926 (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) <=
927 args->minleft))
928 goto out;
929
930 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
931 if (error)
932 goto error;
933 if (fbno == NULLAGBLOCK)
934 goto out;
935
936 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
937 xfs_alloc_allow_busy_reuse(args->datatype));
938
939 if (xfs_alloc_is_userdata(args->datatype)) {
940 struct xfs_buf *bp;
941
942 bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
943 if (!bp) {
944 error = -EFSCORRUPTED;
945 goto error;
946 }
947 xfs_trans_binval(args->tp, bp);
948 }
2d7ea81f
BF
949 *fbnop = args->agbno = fbno;
950 *flenp = args->len = 1;
ba02381c
BF
951 XFS_WANT_CORRUPTED_GOTO(args->mp,
952 fbno < be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
953 error);
954 args->wasfromfl = 1;
955 trace_xfs_alloc_small_freelist(args);
956
957 /*
958 * If we're feeding an AGFL block to something that doesn't live in the
959 * free space, we need to clear out the OWN_AG rmap.
960 */
961 error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
962 &XFS_RMAP_OINFO_AG);
963 if (error)
964 goto error;
965
966 *stat = 0;
967 return 0;
968
969out:
970 /*
971 * Can't do the allocation, give up.
972 */
973 if (flen < args->minlen) {
974 args->agbno = NULLAGBLOCK;
975 trace_xfs_alloc_small_notenough(args);
976 flen = 0;
977 }
978 *fbnop = fbno;
979 *flenp = flen;
980 *stat = 1;
981 trace_xfs_alloc_small_done(args);
982 return 0;
983
984error:
985 trace_xfs_alloc_small_error(args);
986 return error;
987}
988
2bd0ea18
NS
989/*
990 * Allocate a variable extent in the allocation group agno.
991 * Type and bno are used to determine where in the allocation group the
992 * extent will start.
993 * Extent's length (returned in *len) will be between minlen and maxlen,
994 * and of the form k * prod + mod unless there's nothing that large.
995 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
996 */
997STATIC int /* error */
998xfs_alloc_ag_vextent(
dfc130f3 999 xfs_alloc_arg_t *args) /* argument structure for allocation */
2bd0ea18 1000{
0e266570 1001 int error=0;
2bd0ea18
NS
1002
1003 ASSERT(args->minlen > 0);
1004 ASSERT(args->maxlen > 0);
1005 ASSERT(args->minlen <= args->maxlen);
1006 ASSERT(args->mod < args->prod);
1007 ASSERT(args->alignment > 0);
cf8ce220 1008
2bd0ea18
NS
1009 /*
1010 * Branch to correct routine based on the type.
1011 */
1012 args->wasfromfl = 0;
1013 switch (args->type) {
1014 case XFS_ALLOCTYPE_THIS_AG:
1015 error = xfs_alloc_ag_vextent_size(args);
1016 break;
1017 case XFS_ALLOCTYPE_NEAR_BNO:
1018 error = xfs_alloc_ag_vextent_near(args);
1019 break;
1020 case XFS_ALLOCTYPE_THIS_BNO:
1021 error = xfs_alloc_ag_vextent_exact(args);
1022 break;
1023 default:
1024 ASSERT(0);
1025 /* NOTREACHED */
1026 }
a2ceac1f
DC
1027
1028 if (error || args->agbno == NULLAGBLOCK)
2bd0ea18 1029 return error;
2bd0ea18 1030
a2ceac1f
DC
1031 ASSERT(args->len >= args->minlen);
1032 ASSERT(args->len <= args->maxlen);
9760cac2 1033 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
a2ceac1f
DC
1034 ASSERT(args->agbno % args->alignment == 0);
1035
631ac87a 1036 /* if not file data, insert new block into the reverse map btree */
3ee858aa 1037 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
631ac87a
DW
1038 error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
1039 args->agbno, args->len, &args->oinfo);
1040 if (error)
1041 return error;
1042 }
1043
a2ceac1f
DC
1044 if (!args->wasfromfl) {
1045 error = xfs_alloc_update_counters(args->tp, args->pag,
1046 args->agbp,
1047 -((long)(args->len)));
1048 if (error)
1049 return error;
1050
1051 ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
1052 args->agbno, args->len));
2bd0ea18 1053 }
a2ceac1f 1054
cf8ce220 1055 xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
a2ceac1f 1056
79896434
BD
1057 XFS_STATS_INC(args->mp, xs_allocx);
1058 XFS_STATS_ADD(args->mp, xs_allocb, args->len);
a2ceac1f 1059 return error;
2bd0ea18
NS
1060}
1061
1062/*
1063 * Allocate a variable extent at exactly agno/bno.
1064 * Extent's length (returned in *len) will be between minlen and maxlen,
1065 * and of the form k * prod + mod unless there's nothing that large.
1066 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1067 */
1068STATIC int /* error */
1069xfs_alloc_ag_vextent_exact(
dfc130f3 1070 xfs_alloc_arg_t *args) /* allocation argument structure */
2bd0ea18 1071{
dfc130f3
RC
1072 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
1073 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
2bd0ea18
NS
1074 int error;
1075 xfs_agblock_t fbno; /* start block of found extent */
2bd0ea18 1076 xfs_extlen_t flen; /* length of found extent */
cd80de04
CH
1077 xfs_agblock_t tbno; /* start block of busy extent */
1078 xfs_extlen_t tlen; /* length of busy extent */
1079 xfs_agblock_t tend; /* end block of busy extent */
2bd0ea18 1080 int i; /* success/failure of operation */
cd80de04 1081 unsigned busy_gen;
2bd0ea18
NS
1082
1083 ASSERT(args->alignment == 1);
a2ceac1f 1084
2bd0ea18
NS
1085 /*
1086 * Allocate/initialize a cursor for the by-number freespace btree.
1087 */
b194c7d8 1088 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
56b2de80
DC
1089 args->agno, XFS_BTNUM_BNO);
1090
2bd0ea18
NS
1091 /*
1092 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1093 * Look for the closest free block <= bno, it must contain bno
1094 * if any free block does.
1095 */
56b2de80
DC
1096 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1097 if (error)
2bd0ea18 1098 goto error0;
56b2de80
DC
1099 if (!i)
1100 goto not_found;
1101
2bd0ea18
NS
1102 /*
1103 * Grab the freespace record.
1104 */
56b2de80
DC
1105 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1106 if (error)
2bd0ea18 1107 goto error0;
19ebedcf 1108 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
2bd0ea18 1109 ASSERT(fbno <= args->agbno);
56b2de80 1110
5000d01d 1111 /*
a2ceac1f
DC
1112 * Check for overlapping busy extents.
1113 */
cd80de04
CH
1114 tbno = fbno;
1115 tlen = flen;
1116 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
a2ceac1f
DC
1117
1118 /*
1119 * Give up if the start of the extent is busy, or the freespace isn't
1120 * long enough for the minimum request.
2bd0ea18 1121 */
a2ceac1f
DC
1122 if (tbno > args->agbno)
1123 goto not_found;
1124 if (tlen < args->minlen)
1125 goto not_found;
1126 tend = tbno + tlen;
1127 if (tend < args->agbno + args->minlen)
56b2de80
DC
1128 goto not_found;
1129
2bd0ea18
NS
1130 /*
1131 * End of extent will be smaller of the freespace end and the
1132 * maximal requested end.
56b2de80 1133 *
2bd0ea18
NS
1134 * Fix the length according to mod and prod if given.
1135 */
a2ceac1f
DC
1136 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1137 - args->agbno;
2bd0ea18 1138 xfs_alloc_fix_len(args);
a2ceac1f 1139 ASSERT(args->agbno + args->len <= tend);
56b2de80 1140
2bd0ea18 1141 /*
a2ceac1f 1142 * We are allocating agbno for args->len
2bd0ea18
NS
1143 * Allocate/initialize a cursor for the by-size btree.
1144 */
b194c7d8
BN
1145 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1146 args->agno, XFS_BTNUM_CNT);
2bd0ea18 1147 ASSERT(args->agbno + args->len <=
6e3140c7 1148 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
56b2de80
DC
1149 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1150 args->len, XFSA_FIXUP_BNO_OK);
1151 if (error) {
2bd0ea18
NS
1152 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1153 goto error0;
1154 }
a2ceac1f 1155
2bd0ea18
NS
1156 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1157 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
a2ceac1f 1158
2bd0ea18 1159 args->wasfromfl = 0;
56b2de80
DC
1160 trace_xfs_alloc_exact_done(args);
1161 return 0;
1162
1163not_found:
1164 /* Didn't find it, return null. */
1165 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1166 args->agbno = NULLAGBLOCK;
1167 trace_xfs_alloc_exact_notfound(args);
2bd0ea18
NS
1168 return 0;
1169
1170error0:
1171 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
56b2de80
DC
1172 trace_xfs_alloc_exact_error(args);
1173 return error;
1174}
1175
1176/*
4f2eee5a
BF
1177 * Search a given number of btree records in a given direction. Check each
1178 * record against the good extent we've already found.
56b2de80
DC
1179 */
1180STATIC int
4f2eee5a 1181xfs_alloc_walk_iter(
3ca39168
BF
1182 struct xfs_alloc_arg *args,
1183 struct xfs_alloc_cur *acur,
1184 struct xfs_btree_cur *cur,
4f2eee5a
BF
1185 bool increment,
1186 bool find_one, /* quit on first candidate */
1187 int count, /* rec count (-1 for infinite) */
1188 int *stat)
56b2de80 1189{
56b2de80
DC
1190 int error;
1191 int i;
56b2de80 1192
4f2eee5a
BF
1193 *stat = 0;
1194
56b2de80 1195 /*
3ca39168
BF
1196 * Search so long as the cursor is active or we find a better extent.
1197 * The cursor is deactivated if it extends beyond the range of the
1198 * current allocation candidate.
56b2de80 1199 */
4f2eee5a 1200 while (xfs_alloc_cur_active(cur) && count) {
3ca39168 1201 error = xfs_alloc_cur_check(args, acur, cur, &i);
56b2de80 1202 if (error)
3ca39168 1203 return error;
4f2eee5a
BF
1204 if (i == 1) {
1205 *stat = 1;
1206 if (find_one)
1207 break;
1208 }
3ca39168
BF
1209 if (!xfs_alloc_cur_active(cur))
1210 break;
56b2de80 1211
3ca39168
BF
1212 if (increment)
1213 error = xfs_btree_increment(cur, 0, &i);
56b2de80 1214 else
3ca39168 1215 error = xfs_btree_decrement(cur, 0, &i);
56b2de80 1216 if (error)
3ca39168
BF
1217 return error;
1218 if (i == 0)
1219 cur->bc_private.a.priv.abt.active = false;
4f2eee5a
BF
1220
1221 if (count > 0)
1222 count--;
3ca39168 1223 }
56b2de80 1224
56b2de80 1225 return 0;
2bd0ea18
NS
1226}
1227
1228/*
1229 * Allocate a variable extent near bno in the allocation group agno.
1230 * Extent's length (returned in len) will be between minlen and maxlen,
1231 * and of the form k * prod + mod unless there's nothing that large.
1232 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1233 */
2950989c 1234STATIC int
2bd0ea18 1235xfs_alloc_ag_vextent_near(
2950989c 1236 struct xfs_alloc_arg *args)
2bd0ea18 1237{
2950989c 1238 struct xfs_alloc_cur acur = {};
3ca39168
BF
1239 struct xfs_btree_cur *fbcur = NULL;
1240 int error; /* error code */
1241 int i; /* result code, temporary */
3ca39168
BF
1242 xfs_agblock_t bno;
1243 xfs_extlen_t len;
1244 bool fbinc = false;
6beba453 1245#ifdef DEBUG
2bd0ea18
NS
1246 /*
1247 * Randomly don't execute the first algorithm.
1248 */
2bd0ea18 1249 int dofirst; /* set to do first algorithm */
2bd0ea18 1250
49f693fa 1251 dofirst = prandom_u32() & 1;
2bd0ea18 1252#endif
a2ceac1f 1253
ff3263dd
BF
1254 /* handle unitialized agbno range so caller doesn't have to */
1255 if (!args->min_agbno && !args->max_agbno)
1256 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1257 ASSERT(args->min_agbno <= args->max_agbno);
1258
1259 /* clamp agbno to the range if it's outside */
1260 if (args->agbno < args->min_agbno)
1261 args->agbno = args->min_agbno;
1262 if (args->agbno > args->max_agbno)
1263 args->agbno = args->max_agbno;
1264
a2ceac1f 1265restart:
3ca39168 1266 len = 0;
a2ceac1f 1267
2bd0ea18 1268 /*
2950989c
BF
1269 * Set up cursors and see if there are any free extents as big as
1270 * maxlen. If not, pick the last entry in the tree unless the tree is
1271 * empty.
5000d01d 1272 */
2950989c
BF
1273 error = xfs_alloc_cur_setup(args, &acur);
1274 if (error == -ENOSPC) {
3ca39168
BF
1275 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1276 &len, &i);
2950989c
BF
1277 if (error)
1278 goto out;
3ca39168 1279 if (i == 0 || len == 0) {
a2ceac1f 1280 trace_xfs_alloc_near_noentry(args);
2950989c 1281 goto out;
2bd0ea18
NS
1282 }
1283 ASSERT(i == 1);
2950989c
BF
1284 } else if (error) {
1285 goto out;
2bd0ea18
NS
1286 }
1287 args->wasfromfl = 0;
a2ceac1f 1288
5000d01d 1289 /*
2bd0ea18
NS
1290 * First algorithm.
1291 * If the requested extent is large wrt the freespaces available
1292 * in this a.g., then the cursor will be pointing to a btree entry
1293 * near the right edge of the tree. If it's in the last btree leaf
1294 * block, then we just examine all the entries in that block
1295 * that are big enough, and pick the best one.
1296 * This is written as a while loop so we can break out of it,
1297 * but we never loop back to the top.
1298 */
2950989c 1299 while (xfs_btree_islastblock(acur.cnt, 0)) {
6beba453
DC
1300#ifdef DEBUG
1301 if (dofirst)
2bd0ea18
NS
1302 break;
1303#endif
1304 /*
1305 * Start from the entry that lookup found, sequence through
1306 * all larger free blocks. If we're actually pointing at a
1307 * record smaller than maxlen, go to the start of this block,
1308 * and skip all those smaller than minlen.
1309 */
3ca39168 1310 if (len || args->alignment > 1) {
2950989c 1311 acur.cnt->bc_ptrs[0] = 1;
2bd0ea18 1312 do {
3ca39168
BF
1313 error = xfs_alloc_get_rec(acur.cnt, &bno, &len,
1314 &i);
2950989c
BF
1315 if (error)
1316 goto out;
1317 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, out);
3ca39168 1318 if (len >= args->minlen)
2bd0ea18 1319 break;
2950989c
BF
1320 error = xfs_btree_increment(acur.cnt, 0, &i);
1321 if (error)
1322 goto out;
2bd0ea18 1323 } while (i);
3ca39168 1324 ASSERT(len >= args->minlen);
2bd0ea18
NS
1325 if (!i)
1326 break;
1327 }
4f2eee5a
BF
1328
1329 error = xfs_alloc_walk_iter(args, &acur, acur.cnt, true, false,
1330 -1, &i);
1331 if (error)
1332 goto out;
1333
2bd0ea18
NS
1334 /*
1335 * It didn't work. We COULD be in a case where
1336 * there's a good record somewhere, so try again.
1337 */
19fe42e3 1338 if (acur.len == 0)
2bd0ea18 1339 break;
2c003dc2 1340
56b2de80 1341 trace_xfs_alloc_near_first(args);
46f7e323 1342 goto alloc;
2bd0ea18 1343 }
2950989c 1344
2bd0ea18
NS
1345 /*
1346 * Second algorithm.
1347 * Search in the by-bno tree to the left and to the right
1348 * simultaneously, until in each case we find a space big enough,
1349 * or run into the edge of the tree. When we run into the edge,
1350 * we deallocate that cursor.
1351 * If both searches succeed, we compare the two spaces and pick
1352 * the better one.
1353 * With alignment, it's possible for both to fail; the upper
1354 * level algorithm that picks allocation groups for allocations
1355 * is not supposed to do this.
1356 */
2950989c
BF
1357 error = xfs_alloc_lookup_le(acur.bnolt, args->agbno, 0, &i);
1358 if (error)
1359 goto out;
1360 error = xfs_alloc_lookup_ge(acur.bnogt, args->agbno, 0, &i);
1361 if (error)
1362 goto out;
1363
2bd0ea18 1364 /*
4f2eee5a
BF
1365 * Loop going left with the leftward cursor, right with the rightward
1366 * cursor, until either both directions give up or we find an entry at
1367 * least as big as minlen.
2bd0ea18
NS
1368 */
1369 do {
4f2eee5a
BF
1370 error = xfs_alloc_walk_iter(args, &acur, acur.bnolt, false,
1371 true, 1, &i);
1372 if (error)
1373 goto out;
1374 if (i == 1) {
1375 trace_xfs_alloc_cur_left(args);
1376 fbcur = acur.bnogt;
1377 fbinc = true;
1378 break;
2bd0ea18 1379 }
4f2eee5a
BF
1380
1381 error = xfs_alloc_walk_iter(args, &acur, acur.bnogt, true, true,
1382 1, &i);
1383 if (error)
1384 goto out;
1385 if (i == 1) {
1386 trace_xfs_alloc_cur_right(args);
1387 fbcur = acur.bnolt;
1388 fbinc = false;
1389 break;
2bd0ea18 1390 }
2950989c
BF
1391 } while (xfs_alloc_cur_active(acur.bnolt) ||
1392 xfs_alloc_cur_active(acur.bnogt));
56b2de80 1393
3ca39168
BF
1394 /* search the opposite direction for a better entry */
1395 if (fbcur) {
4f2eee5a
BF
1396 error = xfs_alloc_walk_iter(args, &acur, fbcur, fbinc, true, -1,
1397 &i);
56b2de80 1398 if (error)
2950989c 1399 goto out;
2bd0ea18 1400 }
56b2de80 1401
2bd0ea18
NS
1402 /*
1403 * If we couldn't get anything, give up.
1404 */
3ca39168 1405 if (!acur.len) {
e055d59e 1406 if (acur.busy) {
a2ceac1f 1407 trace_xfs_alloc_near_busy(args);
e055d59e
BF
1408 xfs_extent_busy_flush(args->mp, args->pag,
1409 acur.busy_gen);
a2ceac1f
DC
1410 goto restart;
1411 }
56b2de80 1412 trace_xfs_alloc_size_neither(args);
2bd0ea18 1413 args->agbno = NULLAGBLOCK;
2950989c 1414 goto out;
2bd0ea18 1415 }
56b2de80 1416
46f7e323 1417alloc:
3ca39168
BF
1418 args->agbno = acur.bno;
1419 args->len = acur.len;
1420 ASSERT(acur.bno >= acur.rec_bno);
1421 ASSERT(acur.bno + acur.len <= acur.rec_bno + acur.rec_len);
1422 ASSERT(acur.rec_bno + acur.rec_len <=
1423 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
56b2de80 1424
3ca39168
BF
1425 error = xfs_alloc_fixup_trees(acur.cnt, acur.bnolt, acur.rec_bno,
1426 acur.rec_len, acur.bno, acur.len, 0);
56b2de80 1427
2950989c
BF
1428out:
1429 xfs_alloc_cur_close(&acur, error);
2bd0ea18
NS
1430 return error;
1431}
1432
1433/*
1434 * Allocate a variable extent anywhere in the allocation group agno.
1435 * Extent's length (returned in len) will be between minlen and maxlen,
1436 * and of the form k * prod + mod unless there's nothing that large.
1437 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1438 */
1439STATIC int /* error */
1440xfs_alloc_ag_vextent_size(
dfc130f3 1441 xfs_alloc_arg_t *args) /* allocation argument structure */
2bd0ea18 1442{
dfc130f3
RC
1443 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1444 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
2bd0ea18
NS
1445 int error; /* error result */
1446 xfs_agblock_t fbno; /* start of found freespace */
1447 xfs_extlen_t flen; /* length of found freespace */
2bd0ea18
NS
1448 int i; /* temp status variable */
1449 xfs_agblock_t rbno; /* returned block number */
1450 xfs_extlen_t rlen; /* length of returned extent */
cd80de04
CH
1451 bool busy;
1452 unsigned busy_gen;
2bd0ea18 1453
a2ceac1f 1454restart:
2bd0ea18
NS
1455 /*
1456 * Allocate and initialize a cursor for the by-size btree.
1457 */
b194c7d8
BN
1458 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1459 args->agno, XFS_BTNUM_CNT);
2bd0ea18 1460 bno_cur = NULL;
cd80de04 1461 busy = false;
a2ceac1f 1462
2bd0ea18
NS
1463 /*
1464 * Look for an entry >= maxlen+alignment-1 blocks.
1465 */
0e266570
NS
1466 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1467 args->maxlen + args->alignment - 1, &i)))
2bd0ea18 1468 goto error0;
a2ceac1f 1469
2bd0ea18 1470 /*
cd80de04
CH
1471 * If none then we have to settle for a smaller extent. In the case that
1472 * there are no large extents, this will return the last entry in the
1473 * tree unless the tree is empty. In the case that there are only busy
1474 * large extents, this will return the largest small extent unless there
a2ceac1f 1475 * are no smaller extents available.
5000d01d 1476 */
cd80de04 1477 if (!i) {
a2ceac1f
DC
1478 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1479 &fbno, &flen, &i);
1480 if (error)
2bd0ea18
NS
1481 goto error0;
1482 if (i == 0 || flen == 0) {
1483 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
56b2de80 1484 trace_xfs_alloc_size_noentry(args);
2bd0ea18
NS
1485 return 0;
1486 }
1487 ASSERT(i == 1);
cd80de04
CH
1488 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1489 &rlen, &busy_gen);
a2ceac1f
DC
1490 } else {
1491 /*
1492 * Search for a non-busy extent that is large enough.
a2ceac1f
DC
1493 */
1494 for (;;) {
1495 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1496 if (error)
1497 goto error0;
19ebedcf 1498 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
a2ceac1f 1499
cd80de04
CH
1500 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1501 &rbno, &rlen, &busy_gen);
a2ceac1f
DC
1502
1503 if (rlen >= args->maxlen)
1504 break;
1505
1506 error = xfs_btree_increment(cnt_cur, 0, &i);
1507 if (error)
1508 goto error0;
1509 if (i == 0) {
1510 /*
1511 * Our only valid extents must have been busy.
1512 * Make it unbusy by forcing the log out and
cd80de04 1513 * retrying.
a2ceac1f
DC
1514 */
1515 xfs_btree_del_cursor(cnt_cur,
1516 XFS_BTREE_NOERROR);
1517 trace_xfs_alloc_size_busy(args);
cd80de04
CH
1518 xfs_extent_busy_flush(args->mp,
1519 args->pag, busy_gen);
a2ceac1f
DC
1520 goto restart;
1521 }
1522 }
2bd0ea18 1523 }
a2ceac1f 1524
2bd0ea18
NS
1525 /*
1526 * In the first case above, we got the last entry in the
1527 * by-size btree. Now we check to see if the space hits maxlen
1528 * once aligned; if not, we search left for something better.
1529 * This can't happen in the second case above.
1530 */
2bd0ea18 1531 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
19ebedcf 1532 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
2bd0ea18
NS
1533 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
1534 if (rlen < args->maxlen) {
1535 xfs_agblock_t bestfbno;
1536 xfs_extlen_t bestflen;
1537 xfs_agblock_t bestrbno;
1538 xfs_extlen_t bestrlen;
1539
1540 bestrlen = rlen;
1541 bestrbno = rbno;
1542 bestflen = flen;
1543 bestfbno = fbno;
1544 for (;;) {
b194c7d8 1545 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
2bd0ea18
NS
1546 goto error0;
1547 if (i == 0)
1548 break;
0e266570
NS
1549 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1550 &i)))
2bd0ea18 1551 goto error0;
19ebedcf 1552 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
2bd0ea18
NS
1553 if (flen < bestrlen)
1554 break;
cd80de04
CH
1555 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1556 &rbno, &rlen, &busy_gen);
2bd0ea18 1557 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
19ebedcf 1558 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
2bd0ea18
NS
1559 (rlen <= flen && rbno + rlen <= fbno + flen),
1560 error0);
1561 if (rlen > bestrlen) {
1562 bestrlen = rlen;
1563 bestrbno = rbno;
1564 bestflen = flen;
1565 bestfbno = fbno;
1566 if (rlen == args->maxlen)
1567 break;
1568 }
5000d01d 1569 }
0e266570
NS
1570 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1571 &i)))
2bd0ea18 1572 goto error0;
19ebedcf 1573 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
2bd0ea18
NS
1574 rlen = bestrlen;
1575 rbno = bestrbno;
1576 flen = bestflen;
1577 fbno = bestfbno;
1578 }
1579 args->wasfromfl = 0;
1580 /*
1581 * Fix up the length.
1582 */
1583 args->len = rlen;
a2ceac1f 1584 if (rlen < args->minlen) {
cd80de04 1585 if (busy) {
a2ceac1f
DC
1586 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1587 trace_xfs_alloc_size_busy(args);
cd80de04 1588 xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
a2ceac1f
DC
1589 goto restart;
1590 }
1591 goto out_nominleft;
2bd0ea18 1592 }
a2ceac1f
DC
1593 xfs_alloc_fix_len(args);
1594
2bd0ea18 1595 rlen = args->len;
19ebedcf 1596 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
2bd0ea18
NS
1597 /*
1598 * Allocate and initialize a cursor for the by-block tree.
1599 */
b194c7d8
BN
1600 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1601 args->agno, XFS_BTNUM_BNO);
0e266570
NS
1602 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1603 rbno, rlen, XFSA_FIXUP_CNT_OK)))
2bd0ea18
NS
1604 goto error0;
1605 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1606 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1607 cnt_cur = bno_cur = NULL;
1608 args->len = rlen;
1609 args->agbno = rbno;
19ebedcf 1610 XFS_WANT_CORRUPTED_GOTO(args->mp,
2bd0ea18 1611 args->agbno + args->len <=
6e3140c7 1612 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
2bd0ea18 1613 error0);
56b2de80 1614 trace_xfs_alloc_size_done(args);
2bd0ea18
NS
1615 return 0;
1616
1617error0:
56b2de80 1618 trace_xfs_alloc_size_error(args);
2bd0ea18
NS
1619 if (cnt_cur)
1620 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1621 if (bno_cur)
1622 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1623 return error;
a2ceac1f
DC
1624
1625out_nominleft:
1626 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1627 trace_xfs_alloc_size_nominleft(args);
1628 args->agbno = NULLAGBLOCK;
1629 return 0;
2bd0ea18
NS
1630}
1631
2bd0ea18
NS
1632/*
1633 * Free the extent starting at agno/bno for length.
1634 */
85aec44f 1635STATIC int
2bd0ea18 1636xfs_free_ag_extent(
5837e73b
DW
1637 struct xfs_trans *tp,
1638 struct xfs_buf *agbp,
1639 xfs_agnumber_t agno,
1640 xfs_agblock_t bno,
1641 xfs_extlen_t len,
1642 const struct xfs_owner_info *oinfo,
1643 enum xfs_ag_resv_type type)
2bd0ea18 1644{
5837e73b
DW
1645 struct xfs_mount *mp;
1646 struct xfs_perag *pag;
1647 struct xfs_btree_cur *bno_cur;
1648 struct xfs_btree_cur *cnt_cur;
1649 xfs_agblock_t gtbno; /* start of right neighbor */
1650 xfs_extlen_t gtlen; /* length of right neighbor */
1651 xfs_agblock_t ltbno; /* start of left neighbor */
1652 xfs_extlen_t ltlen; /* length of left neighbor */
1653 xfs_agblock_t nbno; /* new starting block of freesp */
1654 xfs_extlen_t nlen; /* new length of freespace */
1655 int haveleft; /* have a left neighbor */
1656 int haveright; /* have a right neighbor */
1657 int i;
1658 int error;
2bd0ea18 1659
631ac87a 1660 bno_cur = cnt_cur = NULL;
2bd0ea18 1661 mp = tp->t_mountp;
631ac87a 1662
3ee858aa 1663 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
631ac87a
DW
1664 error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
1665 if (error)
1666 goto error0;
1667 }
1668
5000d01d 1669 /*
2bd0ea18
NS
1670 * Allocate and initialize a cursor for the by-block btree.
1671 */
b194c7d8 1672 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
5000d01d 1673 /*
2bd0ea18
NS
1674 * Look for a neighboring block on the left (lower block numbers)
1675 * that is contiguous with this space.
1676 */
0e266570 1677 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
2bd0ea18
NS
1678 goto error0;
1679 if (haveleft) {
1680 /*
1681 * There is a block to our left.
1682 */
0e266570 1683 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
2bd0ea18 1684 goto error0;
19ebedcf 1685 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1686 /*
1687 * It's not contiguous, though.
1688 */
1689 if (ltbno + ltlen < bno)
1690 haveleft = 0;
1691 else {
1692 /*
1693 * If this failure happens the request to free this
1694 * space was invalid, it's (partly) already free.
1695 * Very bad.
1696 */
19ebedcf
DC
1697 XFS_WANT_CORRUPTED_GOTO(mp,
1698 ltbno + ltlen <= bno, error0);
2bd0ea18
NS
1699 }
1700 }
5000d01d 1701 /*
2bd0ea18
NS
1702 * Look for a neighboring block on the right (higher block numbers)
1703 * that is contiguous with this space.
1704 */
b194c7d8 1705 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
2bd0ea18
NS
1706 goto error0;
1707 if (haveright) {
1708 /*
1709 * There is a block to our right.
1710 */
0e266570 1711 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
2bd0ea18 1712 goto error0;
19ebedcf 1713 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1714 /*
1715 * It's not contiguous, though.
1716 */
1717 if (bno + len < gtbno)
1718 haveright = 0;
1719 else {
1720 /*
1721 * If this failure happens the request to free this
1722 * space was invalid, it's (partly) already free.
1723 * Very bad.
1724 */
19ebedcf 1725 XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
2bd0ea18
NS
1726 }
1727 }
1728 /*
1729 * Now allocate and initialize a cursor for the by-size tree.
1730 */
b194c7d8 1731 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
2bd0ea18
NS
1732 /*
1733 * Have both left and right contiguous neighbors.
1734 * Merge all three into a single free block.
1735 */
1736 if (haveleft && haveright) {
1737 /*
1738 * Delete the old by-size entry on the left.
1739 */
0e266570 1740 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2bd0ea18 1741 goto error0;
19ebedcf 1742 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
b194c7d8 1743 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 1744 goto error0;
19ebedcf 1745 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1746 /*
1747 * Delete the old by-size entry on the right.
1748 */
0e266570 1749 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2bd0ea18 1750 goto error0;
19ebedcf 1751 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
b194c7d8 1752 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 1753 goto error0;
19ebedcf 1754 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1755 /*
1756 * Delete the old by-block entry for the right block.
1757 */
b194c7d8 1758 if ((error = xfs_btree_delete(bno_cur, &i)))
2bd0ea18 1759 goto error0;
19ebedcf 1760 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1761 /*
1762 * Move the by-block cursor back to the left neighbor.
1763 */
b194c7d8 1764 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2bd0ea18 1765 goto error0;
19ebedcf 1766 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1767#ifdef DEBUG
1768 /*
1769 * Check that this is the right record: delete didn't
1770 * mangle the cursor.
1771 */
1772 {
1773 xfs_agblock_t xxbno;
1774 xfs_extlen_t xxlen;
1775
0e266570
NS
1776 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
1777 &i)))
2bd0ea18 1778 goto error0;
19ebedcf 1779 XFS_WANT_CORRUPTED_GOTO(mp,
2bd0ea18
NS
1780 i == 1 && xxbno == ltbno && xxlen == ltlen,
1781 error0);
1782 }
1783#endif
1784 /*
1785 * Update remaining by-block entry to the new, joined block.
1786 */
1787 nbno = ltbno;
1788 nlen = len + ltlen + gtlen;
0e266570 1789 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
1790 goto error0;
1791 }
1792 /*
1793 * Have only a left contiguous neighbor.
1794 * Merge it together with the new freespace.
1795 */
1796 else if (haveleft) {
1797 /*
1798 * Delete the old by-size entry on the left.
1799 */
0e266570 1800 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2bd0ea18 1801 goto error0;
19ebedcf 1802 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
b194c7d8 1803 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 1804 goto error0;
19ebedcf 1805 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1806 /*
1807 * Back up the by-block cursor to the left neighbor, and
1808 * update its length.
1809 */
b194c7d8 1810 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2bd0ea18 1811 goto error0;
19ebedcf 1812 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1813 nbno = ltbno;
1814 nlen = len + ltlen;
0e266570 1815 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
1816 goto error0;
1817 }
1818 /*
1819 * Have only a right contiguous neighbor.
1820 * Merge it together with the new freespace.
1821 */
1822 else if (haveright) {
1823 /*
1824 * Delete the old by-size entry on the right.
1825 */
0e266570 1826 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2bd0ea18 1827 goto error0;
19ebedcf 1828 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
b194c7d8 1829 if ((error = xfs_btree_delete(cnt_cur, &i)))
2bd0ea18 1830 goto error0;
19ebedcf 1831 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18 1832 /*
5000d01d 1833 * Update the starting block and length of the right
2bd0ea18
NS
1834 * neighbor in the by-block tree.
1835 */
1836 nbno = bno;
1837 nlen = len + gtlen;
0e266570 1838 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2bd0ea18
NS
1839 goto error0;
1840 }
1841 /*
1842 * No contiguous neighbors.
1843 * Insert the new freespace into the by-block tree.
1844 */
1845 else {
1846 nbno = bno;
1847 nlen = len;
b194c7d8 1848 if ((error = xfs_btree_insert(bno_cur, &i)))
2bd0ea18 1849 goto error0;
19ebedcf 1850 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1851 }
1852 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1853 bno_cur = NULL;
1854 /*
1855 * In all cases we need to insert the new freespace in the by-size tree.
1856 */
0e266570 1857 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2bd0ea18 1858 goto error0;
19ebedcf 1859 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
b194c7d8 1860 if ((error = xfs_btree_insert(cnt_cur, &i)))
2bd0ea18 1861 goto error0;
19ebedcf 1862 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
2bd0ea18
NS
1863 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1864 cnt_cur = NULL;
a2ceac1f 1865
2bd0ea18
NS
1866 /*
1867 * Update the freespace totals in the ag and superblock.
1868 */
a2ceac1f
DC
1869 pag = xfs_perag_get(mp, agno);
1870 error = xfs_alloc_update_counters(tp, pag, agbp, len);
cf8ce220 1871 xfs_ag_resv_free_extent(pag, type, tp, len);
a2ceac1f
DC
1872 xfs_perag_put(pag);
1873 if (error)
1874 goto error0;
1875
79896434
BD
1876 XFS_STATS_INC(mp, xs_freex);
1877 XFS_STATS_ADD(mp, xs_freeb, len);
56b2de80 1878
65a15e06 1879 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
3e535bba 1880
2bd0ea18
NS
1881 return 0;
1882
1883 error0:
65a15e06 1884 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2bd0ea18
NS
1885 if (bno_cur)
1886 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1887 if (cnt_cur)
1888 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1889 return error;
1890}
1891
5000d01d 1892/*
2bd0ea18
NS
1893 * Visible (exported) allocation/free functions.
1894 * Some of these are used just by xfs_alloc_btree.c and this file.
1895 */
1896
1897/*
1898 * Compute and fill in value of m_ag_maxlevels.
1899 */
1900void
1901xfs_alloc_compute_maxlevels(
1902 xfs_mount_t *mp) /* file system mount structure */
1903{
1421de38 1904 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
730e2a19 1905 (mp->m_sb.sb_agblocks + 1) / 2);
2bd0ea18
NS
1906}
1907
56b2de80 1908/*
cf8ce220
DW
1909 * Find the length of the longest extent in an AG. The 'need' parameter
1910 * specifies how much space we're going to need for the AGFL and the
1911 * 'reserved' parameter tells us how many blocks in this AG are reserved for
1912 * other callers.
56b2de80
DC
1913 */
1914xfs_extlen_t
1915xfs_alloc_longest_free_extent(
72bda06d 1916 struct xfs_perag *pag,
cf8ce220
DW
1917 xfs_extlen_t need,
1918 xfs_extlen_t reserved)
56b2de80 1919{
72bda06d 1920 xfs_extlen_t delta = 0;
56b2de80 1921
cf8ce220
DW
1922 /*
1923 * If the AGFL needs a recharge, we'll have to subtract that from the
1924 * longest extent.
1925 */
56b2de80
DC
1926 if (need > pag->pagf_flcount)
1927 delta = need - pag->pagf_flcount;
1928
cf8ce220
DW
1929 /*
1930 * If we cannot maintain others' reservations with space from the
1931 * not-longest freesp extents, we'll have to subtract /that/ from
1932 * the longest extent too.
1933 */
1934 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
1935 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
1936
1937 /*
1938 * If the longest extent is long enough to satisfy all the
1939 * reservations and AGFL rules in place, we can return this extent.
1940 */
56b2de80
DC
1941 if (pag->pagf_longest > delta)
1942 return pag->pagf_longest - delta;
cf8ce220
DW
1943
1944 /* Otherwise, let the caller try for 1 block if there's space. */
56b2de80
DC
1945 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
1946}
1947
de046644
DC
1948unsigned int
1949xfs_alloc_min_freelist(
1950 struct xfs_mount *mp,
1951 struct xfs_perag *pag)
1952{
1953 unsigned int min_free;
1954
1955 /* space needed by-bno freespace btree */
1956 min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
1957 mp->m_ag_maxlevels);
1958 /* space needed by-size freespace btree */
1959 min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
1960 mp->m_ag_maxlevels);
b8a8d6e5
DW
1961 /* space needed reverse mapping used space btree */
1962 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
1963 min_free += min_t(unsigned int,
1964 pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
1965 mp->m_rmap_maxlevels);
de046644
DC
1966
1967 return min_free;
1968}
1969
5515b7c1
DC
1970/*
1971 * Check if the operation we are fixing up the freelist for should go ahead or
1972 * not. If we are freeing blocks, we always allow it, otherwise the allocation
1973 * is dependent on whether the size and shape of free space available will
1974 * permit the requested allocation to take place.
1975 */
/*
 * Check if the operation we are fixing up the freelist for should go ahead or
 * not. If we are freeing blocks, we always allow it, otherwise the allocation
 * is dependent on whether the size and shape of free space available will
 * permit the requested allocation to take place.
 *
 * As a side effect, args->maxlen may be clamped down to the space actually
 * available unless XFS_ALLOC_FLAG_CHECK is set.
 */
static bool
xfs_alloc_space_available(
	struct xfs_alloc_arg	*args,
	xfs_extlen_t		min_free,
	int			flags)
{
	struct xfs_perag	*pag = args->pag;
	xfs_extlen_t		alloc_len, longest;
	xfs_extlen_t		reservation; /* blocks that are still reserved */
	int			available;
	xfs_extlen_t		agflcount;

	/* Frees always proceed; they can only add space. */
	if (flags & XFS_ALLOC_FLAG_FREEING)
		return true;

	reservation = xfs_ag_resv_needed(pag, args->resv);

	/* do we have enough contiguous free space for the allocation? */
	/* worst case: minlen plus full alignment padding on both ends */
	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
	longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
	if (longest < alloc_len)
		return false;

	/*
	 * Do we have enough free space remaining for the allocation? Don't
	 * account extra agfl blocks because we are about to defer free them,
	 * making them unavailable until the current transaction commits.
	 */
	agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
	/* signed on purpose: the subtraction below can legitimately go negative */
	available = (int)(pag->pagf_freeblks + agflcount -
			  reservation - min_free - args->minleft);
	if (available < (int)max(args->total, alloc_len))
		return false;

	/*
	 * Clamp maxlen to the amount of free space available for the actual
	 * extent allocation.
	 */
	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
		args->maxlen = available;
		ASSERT(args->maxlen > 0);
		ASSERT(args->maxlen >= args->minlen);
	}

	return true;
}
2022
30c8be8a
BF
/*
 * Free a single block that previously sat on the AGFL back to the free space
 * btrees, charging it to the AGFL reservation, and invalidate its buffer so
 * nothing stale gets written back to disk.
 */
int
xfs_free_agfl_block(
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	struct xfs_buf		*agbp,
	struct xfs_owner_info	*oinfo)
{
	int			error;
	struct xfs_buf		*bp;

	/* Return the single block to the free space btrees. */
	error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
				   XFS_AG_RESV_AGFL);
	if (error)
		return error;

	/* Grab the block's buffer and invalidate it in this transaction. */
	bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
	if (!bp)
		return -EFSCORRUPTED;
	xfs_trans_binval(tp, bp);

	return 0;
}
2046
8dbee8f5
BF
2047/*
2048 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2049 * is to detect an agfl header padding mismatch between current and early v5
2050 * kernels. This problem manifests as a 1-slot size difference between the
2051 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2052 * may also catch variants of agfl count corruption unrelated to padding. Either
2053 * way, we'll reset the agfl and warn the user.
2054 *
2055 * Return true if a reset is required before the agfl can be used, false
2056 * otherwise.
2057 */
2058static bool
2059xfs_agfl_needs_reset(
2060 struct xfs_mount *mp,
2061 struct xfs_agf *agf)
2062{
2063 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2064 uint32_t l = be32_to_cpu(agf->agf_fllast);
2065 uint32_t c = be32_to_cpu(agf->agf_flcount);
2066 int agfl_size = xfs_agfl_size(mp);
2067 int active;
2068
2069 /* no agfl header on v4 supers */
2070 if (!xfs_sb_version_hascrc(&mp->m_sb))
2071 return false;
2072
2073 /*
2074 * The agf read verifier catches severe corruption of these fields.
2075 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2076 * the verifier allows it.
2077 */
2078 if (f >= agfl_size || l >= agfl_size)
2079 return true;
2080 if (c > agfl_size)
2081 return true;
2082
2083 /*
2084 * Check consistency between the on-disk count and the active range. An
2085 * agfl padding mismatch manifests as an inconsistent flcount.
2086 */
2087 if (c && l >= f)
2088 active = l - f + 1;
2089 else if (c)
2090 active = agfl_size - f + l + 1;
2091 else
2092 active = 0;
2093
2094 return active != c;
2095}
2096
2097/*
2098 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2099 * agfl content cannot be trusted. Warn the user that a repair is required to
2100 * recover leaked blocks.
2101 *
2102 * The purpose of this mechanism is to handle filesystems affected by the agfl
2103 * header padding mismatch problem. A reset keeps the filesystem online with a
2104 * relatively minor free space accounting inconsistency rather than suffer the
2105 * inevitable crash from use of an invalid agfl block.
2106 */
static void
xfs_agfl_reset(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);

	/* Only called once the read path has flagged the agfl as suspect. */
	ASSERT(pag->pagf_agflreset);
	trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);

	/* Any blocks still on the list are leaked; only repair can recover them. */
	xfs_warn(mp,
	       "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
	       "Please unmount and run xfs_repair.",
	         pag->pag_agno, pag->pagf_flcount);

	/* Empty ring: first wraps to slot 0, last points at the final slot. */
	agf->agf_flfirst = 0;
	agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
	agf->agf_flcount = 0;
	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
				    XFS_AGF_FLCOUNT);

	/* Keep the in-core mirror consistent and clear the reset latch. */
	pag->pagf_flcount = 0;
	pag->pagf_agflreset = false;
}
2133
d5c1b462
BF
2134/*
2135 * Defer an AGFL block free. This is effectively equivalent to
2136 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2137 *
2138 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2139 * allocation operations in a transaction. AGFL frees are prone to this problem
2140 * because for one they are always freed one at a time. Further, an immediate
2141 * AGFL block free can cause a btree join and require another block free before
2142 * the real allocation can proceed. Deferring the free disconnects freeing up
2143 * the AGFL slot from freeing the block.
2144 */
STATIC void
xfs_defer_agfl_block(
	struct xfs_trans		*tp,
	xfs_agnumber_t			agno,
	xfs_fsblock_t			agbno,
	struct xfs_owner_info		*oinfo)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_extent_free_item	*new;		/* new element */

	ASSERT(xfs_bmap_free_item_zone != NULL);
	ASSERT(oinfo != NULL);

	/* Build a one-block free-extent work item for the deferred-ops queue. */
	new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
	new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
	new->xefi_blockcount = 1;
	new->xefi_oinfo = *oinfo;

	trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);

	/*
	 * Queue the item; ownership of 'new' passes to the defer
	 * infrastructure, which processes and frees it at commit time.
	 */
	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
}
2167
2bd0ea18
NS
2168/*
2169 * Decide whether to use this allocation group for this allocation.
2170 * If so, fix up the btree freelist's size.
2bd0ea18 2171 */
ff105f75 2172int /* error */
2bd0ea18 2173xfs_alloc_fix_freelist(
c98e644e
DC
2174 struct xfs_alloc_arg *args, /* allocation argument structure */
2175 int flags) /* XFS_ALLOC_FLAG_... */
2bd0ea18 2176{
c98e644e
DC
2177 struct xfs_mount *mp = args->mp;
2178 struct xfs_perag *pag = args->pag;
2179 struct xfs_trans *tp = args->tp;
2180 struct xfs_buf *agbp = NULL;
2181 struct xfs_buf *agflbp = NULL;
2182 struct xfs_alloc_arg targs; /* local allocation arguments */
2183 xfs_agblock_t bno; /* freelist block */
2184 xfs_extlen_t need; /* total blocks needed in freelist */
fcdd428c 2185 int error = 0;
c98e644e 2186
d85d3424
BF
2187 /* deferred ops (AGFL block frees) require permanent transactions */
2188 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2189
2bd0ea18 2190 if (!pag->pagf_init) {
c98e644e
DC
2191 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2192 if (error)
2193 goto out_no_agbp;
2bd0ea18 2194 if (!pag->pagf_init) {
5e656dbb
BN
2195 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
2196 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
c98e644e 2197 goto out_agbp_relse;
2bd0ea18 2198 }
c98e644e 2199 }
34317449 2200
5e656dbb 2201 /*
c98e644e
DC
2202 * If this is a metadata preferred pag and we are user data then try
2203 * somewhere else if we are not being asked to try harder at this
2204 * point
34317449 2205 */
1fccd5c8 2206 if (pag->pagf_metadata && xfs_alloc_is_userdata(args->datatype) &&
5e656dbb
BN
2207 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2208 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
c98e644e 2209 goto out_agbp_relse;
34317449
NS
2210 }
2211
de046644 2212 need = xfs_alloc_min_freelist(mp, pag);
2c003dc2
CH
2213 if (!xfs_alloc_space_available(args, need, flags |
2214 XFS_ALLOC_FLAG_CHECK))
c98e644e 2215 goto out_agbp_relse;
5e656dbb 2216
2bd0ea18
NS
2217 /*
2218 * Get the a.g. freespace buffer.
2219 * Can fail if we're not blocking on locks, and it's held.
2220 */
c98e644e
DC
2221 if (!agbp) {
2222 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2223 if (error)
2224 goto out_no_agbp;
2225 if (!agbp) {
5e656dbb
BN
2226 ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK);
2227 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
c98e644e 2228 goto out_no_agbp;
2bd0ea18
NS
2229 }
2230 }
72bda06d 2231
8dbee8f5
BF
2232 /* reset a padding mismatched agfl before final free space check */
2233 if (pag->pagf_agflreset)
2234 xfs_agfl_reset(tp, agbp, pag);
2235
72bda06d 2236 /* If there isn't enough total space or single-extent, reject it. */
de046644 2237 need = xfs_alloc_min_freelist(mp, pag);
c98e644e
DC
2238 if (!xfs_alloc_space_available(args, need, flags))
2239 goto out_agbp_relse;
5515b7c1 2240
2bd0ea18
NS
2241 /*
2242 * Make the freelist shorter if it's too long.
72bda06d 2243 *
c98e644e
DC
2244 * Note that from this point onwards, we will always release the agf and
2245 * agfl buffers on error. This handles the case where we error out and
2246 * the buffers are clean or may not have been joined to the transaction
2247 * and hence need to be released manually. If they have been joined to
2248 * the transaction, then xfs_trans_brelse() will handle them
2249 * appropriately based on the recursion count and dirty state of the
2250 * buffer.
2251 *
72bda06d
DC
2252 * XXX (dgc): When we have lots of free space, does this buy us
2253 * anything other than extra overhead when we need to put more blocks
2254 * back on the free list? Maybe we should only do this when space is
2255 * getting low or the AGFL is more than half full?
e365af6f
DW
2256 *
2257 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2258 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2259 * updating the rmapbt. Both flags are used in xfs_repair while we're
2260 * rebuilding the rmapbt, and neither are used by the kernel. They're
2261 * both required to ensure that rmaps are correctly recorded for the
2262 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2263 * repair/rmap.c in xfsprogs for details.
2bd0ea18 2264 */
e365af6f 2265 memset(&targs, 0, sizeof(targs));
007347e3 2266 /* struct copy below */
e365af6f 2267 if (flags & XFS_ALLOC_FLAG_NORMAP)
007347e3 2268 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
e365af6f 2269 else
007347e3 2270 targs.oinfo = XFS_RMAP_OINFO_AG;
e365af6f 2271 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
5e656dbb
BN
2272 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2273 if (error)
c98e644e 2274 goto out_agbp_relse;
30c8be8a 2275
6dc24128
BF
2276 /* defer agfl frees */
2277 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2bd0ea18 2278 }
72bda06d 2279
2bd0ea18
NS
2280 targs.tp = tp;
2281 targs.mp = mp;
2282 targs.agbp = agbp;
2283 targs.agno = args->agno;
cf8ce220 2284 targs.alignment = targs.minlen = targs.prod = 1;
2bd0ea18
NS
2285 targs.type = XFS_ALLOCTYPE_THIS_AG;
2286 targs.pag = pag;
72bda06d
DC
2287 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2288 if (error)
c98e644e 2289 goto out_agbp_relse;
72bda06d
DC
2290
2291 /* Make the freelist longer if it's too short. */
2292 while (pag->pagf_flcount < need) {
2bd0ea18 2293 targs.agbno = 0;
72bda06d 2294 targs.maxlen = need - pag->pagf_flcount;
9760cac2 2295 targs.resv = XFS_AG_RESV_AGFL;
72bda06d
DC
2296
2297 /* Allocate as many blocks as possible at once. */
2298 error = xfs_alloc_ag_vextent(&targs);
c98e644e
DC
2299 if (error)
2300 goto out_agflbp_relse;
2301
2bd0ea18 2302 /*
dfc130f3
RC
2303 * Stop if we run out. Won't happen if callers are obeying
2304 * the restrictions correctly. Can happen for free calls
2bd0ea18
NS
2305 * on a completely full ag.
2306 */
5e656dbb
BN
2307 if (targs.agbno == NULLAGBLOCK) {
2308 if (flags & XFS_ALLOC_FLAG_FREEING)
2309 break;
c98e644e 2310 goto out_agflbp_relse;
5e656dbb 2311 }
2bd0ea18
NS
2312 /*
2313 * Put each allocated block on the list.
2314 */
2315 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
5e656dbb
BN
2316 error = xfs_alloc_put_freelist(tp, agbp,
2317 agflbp, bno, 0);
2318 if (error)
c98e644e 2319 goto out_agflbp_relse;
2bd0ea18
NS
2320 }
2321 }
cb4deb22 2322 xfs_trans_brelse(tp, agflbp);
2bd0ea18
NS
2323 args->agbp = agbp;
2324 return 0;
c98e644e
DC
2325
2326out_agflbp_relse:
2327 xfs_trans_brelse(tp, agflbp);
2328out_agbp_relse:
2329 if (agbp)
2330 xfs_trans_brelse(tp, agbp);
2331out_no_agbp:
2332 args->agbp = NULL;
2333 return error;
2bd0ea18
NS
2334}
2335
2336/*
2337 * Get a block from the freelist.
2338 * Returns with the buffer for the block gotten.
2339 */
int				/* error */
xfs_alloc_get_freelist(
	xfs_trans_t	*tp,	/* transaction pointer */
	xfs_buf_t	*agbp,	/* buffer containing the agf structure */
	xfs_agblock_t	*bnop,	/* block address retrieved from freelist */
	int		btreeblk) /* destination is a AGF btree */
{
	xfs_agf_t	*agf;	/* a.g. freespace structure */
	xfs_buf_t	*agflbp;/* buffer for a.g. freelist structure */
	xfs_agblock_t	bno;	/* block number returned */
	__be32		*agfl_bno;
	int		error;
	int		logflags;
	xfs_mount_t	*mp = tp->t_mountp;
	xfs_perag_t	*pag;	/* per allocation group data */

	/*
	 * Freelist is empty, give up.
	 */
	agf = XFS_BUF_TO_AGF(agbp);
	if (!agf->agf_flcount) {
		*bnop = NULLAGBLOCK;
		return 0;
	}
	/*
	 * Read the array of free blocks.
	 */
	error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
				    &agflbp);
	if (error)
		return error;


	/*
	 * Get the block number and update the data structures.
	 */
	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
	bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
	be32_add_cpu(&agf->agf_flfirst, 1);
	xfs_trans_brelse(tp, agflbp);
	/* the AGFL is a ring buffer: wrap flfirst back to slot 0 */
	if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
		agf->agf_flfirst = 0;

	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
	/* a pending agfl reset must have been applied before consuming slots */
	ASSERT(!pag->pagf_agflreset);
	be32_add_cpu(&agf->agf_flcount, -1);
	xfs_trans_agflist_delta(tp, -1);
	pag->pagf_flcount--;

	logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
	if (btreeblk) {
		/* track blocks handed to the AGF btrees separately */
		be32_add_cpu(&agf->agf_btreeblks, 1);
		pag->pagf_btreeblks++;
		logflags |= XFS_AGF_BTREEBLKS;
	}
	xfs_perag_put(pag);

	xfs_alloc_log_agf(tp, agbp, logflags);
	*bnop = bno;

	return 0;
}
2402
2403/*
2404 * Log the given fields from the agf structure.
2405 */
/*
 * Log the given fields from the agf structure.
 */
void
xfs_alloc_log_agf(
	xfs_trans_t	*tp,	/* transaction pointer */
	xfs_buf_t	*bp,	/* buffer for a.g. freelist header */
	int		fields)	/* mask of fields to be logged (XFS_AGF_...) */
{
	int	first;		/* first byte offset */
	int	last;		/* last byte offset */
	/*
	 * Byte offset of each loggable AGF field; the entry order must match
	 * the XFS_AGF_* flag bit order so xfs_btree_offsets() can map the
	 * 'fields' mask to a byte range.
	 */
	static const short	offsets[] = {
		offsetof(xfs_agf_t, agf_magicnum),
		offsetof(xfs_agf_t, agf_versionnum),
		offsetof(xfs_agf_t, agf_seqno),
		offsetof(xfs_agf_t, agf_length),
		offsetof(xfs_agf_t, agf_roots[0]),
		offsetof(xfs_agf_t, agf_levels[0]),
		offsetof(xfs_agf_t, agf_flfirst),
		offsetof(xfs_agf_t, agf_fllast),
		offsetof(xfs_agf_t, agf_flcount),
		offsetof(xfs_agf_t, agf_freeblks),
		offsetof(xfs_agf_t, agf_longest),
		offsetof(xfs_agf_t, agf_btreeblks),
		offsetof(xfs_agf_t, agf_uuid),
		offsetof(xfs_agf_t, agf_rmap_blocks),
		offsetof(xfs_agf_t, agf_refcount_blocks),
		offsetof(xfs_agf_t, agf_refcount_root),
		offsetof(xfs_agf_t, agf_refcount_level),
		/* needed so that we don't log the whole rest of the structure: */
		offsetof(xfs_agf_t, agf_spare64),
		sizeof(xfs_agf_t)
	};

	trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_);

	/* mark the buffer as an AGF so log recovery picks the right verifier */
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);

	xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
	xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
}
2444
2445/*
2446 * Interface for inode allocation to force the pag data to be initialized.
2447 */
2448int /* error */
2449xfs_alloc_pagf_init(
2450 xfs_mount_t *mp, /* file system mount structure */
2451 xfs_trans_t *tp, /* transaction pointer */
2452 xfs_agnumber_t agno, /* allocation group number */
2453 int flags) /* XFS_ALLOC_FLAGS_... */
2454{
7a3bffe4 2455 xfs_buf_t *bp;
2bd0ea18
NS
2456 int error;
2457
0e266570 2458 if ((error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp)))
2bd0ea18
NS
2459 return error;
2460 if (bp)
2461 xfs_trans_brelse(tp, bp);
2462 return 0;
2463}
2464
2465/*
2466 * Put the block on the freelist for the allocation group.
2467 */
/*
 * Put the block on the freelist for the allocation group.
 */
int					/* error */
xfs_alloc_put_freelist(
	xfs_trans_t		*tp,	/* transaction pointer */
	xfs_buf_t		*agbp,	/* buffer for a.g. freelist header */
	xfs_buf_t		*agflbp,/* buffer for a.g. free block array */
	xfs_agblock_t		bno,	/* block being freed */
	int			btreeblk) /* block came from a AGF btree */
{
	xfs_agf_t		*agf;	/* a.g. freespace structure */
	__be32			*blockp;/* pointer to array entry */
	int			error;
	int			logflags;
	xfs_mount_t		*mp;	/* mount structure */
	xfs_perag_t		*pag;	/* per allocation group data */
	__be32			*agfl_bno;
	int			startoff;

	agf = XFS_BUF_TO_AGF(agbp);
	mp = tp->t_mountp;

	/* read the AGFL buffer ourselves if the caller didn't supply one */
	if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
			be32_to_cpu(agf->agf_seqno), &agflbp)))
		return error;
	/* the AGFL is a ring buffer: advance fllast, wrapping to slot 0 */
	be32_add_cpu(&agf->agf_fllast, 1);
	if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
		agf->agf_fllast = 0;

	pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
	/* a pending agfl reset must have been applied before adding slots */
	ASSERT(!pag->pagf_agflreset);
	be32_add_cpu(&agf->agf_flcount, 1);
	xfs_trans_agflist_delta(tp, 1);
	pag->pagf_flcount++;

	logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
	if (btreeblk) {
		/* block returned from an AGF btree; adjust the btree counter */
		be32_add_cpu(&agf->agf_btreeblks, -1);
		pag->pagf_btreeblks--;
		logflags |= XFS_AGF_BTREEBLKS;
	}
	xfs_perag_put(pag);

	xfs_alloc_log_agf(tp, agbp, logflags);

	ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));

	/* store the freed block in the slot fllast now points at */
	agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
	blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
	*blockp = cpu_to_be32(bno);
	startoff = (char *)blockp - (char *)agflbp->b_addr;

	/*
	 * NOTE(review): this second xfs_alloc_log_agf() call with the same
	 * logflags looks redundant with the one above; preserved as-is.
	 */
	xfs_alloc_log_agf(tp, agbp, logflags);

	/* log only the one modified AGFL slot */
	xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
	xfs_trans_log_buf(tp, agflbp, startoff,
			  startoff + sizeof(xfs_agblock_t) - 1);
	return 0;
}
2525
bc01119d 2526static xfs_failaddr_t
a2ceac1f 2527xfs_agf_verify(
95d9582b
DW
2528 struct xfs_buf *bp)
2529{
7861ef77 2530 struct xfs_mount *mp = bp->b_mount;
95d9582b 2531 struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
a2ceac1f 2532
a65d8d29
BF
2533 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2534 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
bc01119d 2535 return __this_address;
a65d8d29
BF
2536 if (!xfs_log_check_lsn(mp,
2537 be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
bc01119d 2538 return __this_address;
a65d8d29 2539 }
a2ceac1f 2540
68dbe77f
BF
2541 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2542 return __this_address;
2543
2544 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
dd5b876e 2545 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
b8165508
DC
2546 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2547 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2548 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
bc01119d 2549 return __this_address;
a2ceac1f 2550
00795aae
DW
2551 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2552 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2553 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
5a35bf2c 2554 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
bc01119d 2555 return __this_address;
5a35bf2c 2556
e37838e5 2557 if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
00795aae
DW
2558 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2559 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
bc01119d 2560 return __this_address;
e37838e5 2561
a2ceac1f
DC
2562 /*
2563 * during growfs operations, the perag is not fully initialised,
2564 * so we can't use it for any useful checking. growfs ensures we can't
2565 * use it by using uncached buffers that don't have the perag attached
2566 * so we can detect and avoid this problem.
2567 */
dd5b876e 2568 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
bc01119d 2569 return __this_address;
a2ceac1f 2570
dd5b876e
DC
2571 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2572 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
bc01119d 2573 return __this_address;
dd5b876e 2574
88ce0792 2575 if (xfs_sb_version_hasreflink(&mp->m_sb) &&
00795aae
DW
2576 (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2577 be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
bc01119d 2578 return __this_address;
88ce0792 2579
bc01119d 2580 return NULL;
a2ceac1f 2581
a2ceac1f
DC
2582}
2583
2584static void
2585xfs_agf_read_verify(
2586 struct xfs_buf *bp)
2587{
7861ef77 2588 struct xfs_mount *mp = bp->b_mount;
1e697959 2589 xfs_failaddr_t fa;
dd5b876e 2590
45922933
DC
2591 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2592 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
1e697959
DW
2593 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2594 else {
95d9582b 2595 fa = xfs_agf_verify(bp);
1e697959
DW
2596 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2597 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2598 }
a2ceac1f
DC
2599}
2600
/*
 * Write verifier for the AGF: run the structural checks, then (on v5
 * filesystems) stamp the last-modification LSN and recompute the CRC.
 */
static void
xfs_agf_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	xfs_failaddr_t		fa;

	/* never write out a structurally bad AGF */
	fa = xfs_agf_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	/* v4 filesystems have no AGF CRC or LSN to maintain */
	if (!xfs_sb_version_hascrc(&mp->m_sb))
		return;

	/* record the LSN of the last log item to touch this buffer */
	if (bip)
		XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	/* CRC must be computed last, after all fields are final */
	xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
2623
/* Buffer ops wiring the AGF verifiers into the buffer cache / log recovery. */
const struct xfs_buf_ops xfs_agf_buf_ops = {
	.name = "xfs_agf",
	/* same magic for v4 and v5 filesystems */
	.magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
	.verify_read = xfs_agf_read_verify,
	.verify_write = xfs_agf_write_verify,
	.verify_struct = xfs_agf_verify,
};
2631
2bd0ea18
NS
2632/*
2633 * Read in the allocation group header (free/alloc section).
2634 */
/*
 * Read in the allocation group header (free/alloc section).
 */
int					/* error */
xfs_read_agf(
	struct xfs_mount	*mp,	/* mount point structure */
	struct xfs_trans	*tp,	/* transaction pointer */
	xfs_agnumber_t		agno,	/* allocation group number */
	int			flags,	/* XFS_BUF_ */
	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
{
	int		error;

	trace_xfs_read_agf(mp, agno);

	ASSERT(agno != NULLAGNUMBER);
	/* the AGF verifiers run as part of the buffer read */
	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
	if (error)
		return error;
	/* NULL buffer without error means a trylock read didn't get the lock */
	if (!*bpp)
		return 0;

	ASSERT(!(*bpp)->b_error);
	/* AGF buffers are hot; keep them in cache preferentially */
	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
	return 0;
}
2661
2662/*
2663 * Read in the allocation group header (free/alloc section).
2664 */
2665int /* error */
2666xfs_alloc_read_agf(
2667 struct xfs_mount *mp, /* mount point structure */
2668 struct xfs_trans *tp, /* transaction pointer */
2669 xfs_agnumber_t agno, /* allocation group number */
2670 int flags, /* XFS_ALLOC_FLAG_... */
2671 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2672{
2673 struct xfs_agf *agf; /* ag freelist header */
2674 struct xfs_perag *pag; /* per allocation group data */
2675 int error;
2676
ff105f75 2677 trace_xfs_alloc_read_agf(mp, agno);
56b2de80 2678
ff105f75 2679 ASSERT(agno != NULLAGNUMBER);
56b2de80
DC
2680 error = xfs_read_agf(mp, tp, agno,
2681 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2682 bpp);
2683 if (error)
2684 return error;
2685 if (!*bpp)
2686 return 0;
a2ceac1f 2687 ASSERT(!(*bpp)->b_error);
56b2de80
DC
2688
2689 agf = XFS_BUF_TO_AGF(*bpp);
2690 pag = xfs_perag_get(mp, agno);
2bd0ea18 2691 if (!pag->pagf_init) {
6e3140c7 2692 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
cdded3d8 2693 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
6e3140c7
NS
2694 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
2695 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
2bd0ea18 2696 pag->pagf_levels[XFS_BTNUM_BNOi] =
6e3140c7 2697 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
2bd0ea18 2698 pag->pagf_levels[XFS_BTNUM_CNTi] =
6e3140c7 2699 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
e37838e5
DW
2700 pag->pagf_levels[XFS_BTNUM_RMAPi] =
2701 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
88ce0792 2702 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
2bd0ea18 2703 pag->pagf_init = 1;
8dbee8f5 2704 pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
2bd0ea18
NS
2705 }
2706#ifdef DEBUG
2707 else if (!XFS_FORCED_SHUTDOWN(mp)) {
6e3140c7 2708 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
cdded3d8 2709 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
6e3140c7
NS
2710 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
2711 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
2bd0ea18 2712 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
6e3140c7 2713 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
2bd0ea18 2714 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
6e3140c7 2715 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
2bd0ea18
NS
2716 }
2717#endif
56b2de80 2718 xfs_perag_put(pag);
2bd0ea18
NS
2719 return 0;
2720}
2721
2722/*
2723 * Allocate an extent (variable-size).
2724 * Depending on the allocation type, we either look in a single allocation
2725 * group or loop over the allocation groups to find the result.
2726 */
int				/* error */
xfs_alloc_vextent(
	struct xfs_alloc_arg	*args)	/* allocation argument structure */
{
	xfs_agblock_t	agsize;	/* allocation group size */
	int		error;
	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
	struct xfs_mount *mp;	/* mount structure pointer */
	xfs_agnumber_t	sagno;	/* starting allocation group number */
	xfs_alloctype_t	type;	/* input allocation type */
	int		bump_rotor = 0;
	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */

	mp = args->mp;
	/* Remember the caller's original type; args->type may be rewritten below. */
	type = args->otype = args->type;
	args->agbno = NULLAGBLOCK;
	/*
	 * Just fix this up, for the case where the last a.g. is shorter
	 * (or there's only one a.g.) and the caller couldn't easily figure
	 * that out (xfs_bmap_alloc).
	 */
	agsize = mp->m_sb.sb_agblocks;
	if (args->maxlen > agsize)
		args->maxlen = agsize;
	if (args->alignment == 0)
		args->alignment = 1;
	ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
	ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
	ASSERT(args->minlen <= args->maxlen);
	ASSERT(args->minlen <= agsize);
	ASSERT(args->mod < args->prod);
	/*
	 * Runtime re-check of the ASSERTs above so non-debug builds fail the
	 * allocation gracefully (fsbno = NULLFSBLOCK, return 0) instead of
	 * operating on garbage arguments.
	 */
	if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
	    XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
	    args->minlen > args->maxlen || args->minlen > agsize ||
	    args->mod >= args->prod) {
		args->fsbno = NULLFSBLOCK;
		trace_xfs_alloc_vextent_badargs(args);
		return 0;
	}

	switch (type) {
	case XFS_ALLOCTYPE_THIS_AG:
	case XFS_ALLOCTYPE_NEAR_BNO:
	case XFS_ALLOCTYPE_THIS_BNO:
		/*
		 * These three force us into a single a.g.
		 */
		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
		args->pag = xfs_perag_get(mp, args->agno);
		error = xfs_alloc_fix_freelist(args, 0);
		if (error) {
			trace_xfs_alloc_vextent_nofix(args);
			goto error0;
		}
		/* No agbp means the AG has insufficient space; fail quietly. */
		if (!args->agbp) {
			trace_xfs_alloc_vextent_noagbp(args);
			break;
		}
		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
		if ((error = xfs_alloc_ag_vextent(args)))
			goto error0;
		break;
	case XFS_ALLOCTYPE_START_BNO:
		/*
		 * Try near allocation first, then anywhere-in-ag after
		 * the first a.g. fails.
		 */
		if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
		    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
			/*
			 * inode32: spread initial user data across AGs using
			 * the agf rotor instead of honoring the target bno.
			 */
			args->fsbno = XFS_AGB_TO_FSB(mp,
					((mp->m_agfrotor / rotorstep) %
					mp->m_sb.sb_agcount), 0);
			bump_rotor = 1;
		}
		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
		args->type = XFS_ALLOCTYPE_NEAR_BNO;
		/* FALLTHROUGH */
	case XFS_ALLOCTYPE_FIRST_AG:
		/*
		 * Rotate through the allocation groups looking for a winner.
		 */
		if (type == XFS_ALLOCTYPE_FIRST_AG) {
			/*
			 * Start with allocation group given by bno.
			 */
			args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
			args->type = XFS_ALLOCTYPE_THIS_AG;
			sagno = 0;
			flags = 0;
		} else {
			/*
			 * Start with the given allocation group.
			 */
			args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
			flags = XFS_ALLOC_FLAG_TRYLOCK;
		}
		/*
		 * Loop over allocation groups twice; first time with
		 * trylock set, second time without.
		 */
		for (;;) {
			/*
			 * One perag reference per iteration; released at the
			 * loop bottom on failure, or held across the break on
			 * success and released at function exit.
			 */
			args->pag = xfs_perag_get(mp, args->agno);
			error = xfs_alloc_fix_freelist(args, flags);
			if (error) {
				trace_xfs_alloc_vextent_nofix(args);
				goto error0;
			}
			/*
			 * If we get a buffer back then the allocation will fly.
			 */
			if (args->agbp) {
				if ((error = xfs_alloc_ag_vextent(args)))
					goto error0;
				break;
			}

			trace_xfs_alloc_vextent_loopfailed(args);

			/*
			 * Didn't work, figure out the next iteration.
			 */
			if (args->agno == sagno &&
			    type == XFS_ALLOCTYPE_START_BNO)
				args->type = XFS_ALLOCTYPE_THIS_AG;
			/*
			 * For the first allocation, we can try any AG to get
			 * space.  However, if we already have allocated a
			 * block, we don't want to try AGs whose number is below
			 * sagno. Otherwise, we may end up with out-of-order
			 * locking of AGF, which might cause deadlock.
			 */
			if (++(args->agno) == mp->m_sb.sb_agcount) {
				if (args->tp->t_firstblock != NULLFSBLOCK)
					args->agno = sagno;
				else
					args->agno = 0;
			}
			/*
			 * Reached the starting a.g., must either be done
			 * or switch to non-trylock mode.
			 */
			if (args->agno == sagno) {
				if (flags == 0) {
					/* Second pass exhausted: give up. */
					args->agbno = NULLAGBLOCK;
					trace_xfs_alloc_vextent_allfailed(args);
					break;
				}

				flags = 0;
				if (type == XFS_ALLOCTYPE_START_BNO) {
					/* Restore the near-bno target for pass two. */
					args->agbno = XFS_FSB_TO_AGBNO(mp,
						args->fsbno);
					args->type = XFS_ALLOCTYPE_NEAR_BNO;
				}
			}
			xfs_perag_put(args->pag);
		}
		if (bump_rotor) {
			/* Advance the inode32 rotor past the AG we used. */
			if (args->agno == sagno)
				mp->m_agfrotor = (mp->m_agfrotor + 1) %
					(mp->m_sb.sb_agcount * rotorstep);
			else
				mp->m_agfrotor = (args->agno * rotorstep + 1) %
					(mp->m_sb.sb_agcount * rotorstep);
		}
		break;
	default:
		ASSERT(0);
		/* NOTREACHED */
	}
	if (args->agbno == NULLAGBLOCK)
		args->fsbno = NULLFSBLOCK;
	else {
		args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
#ifdef DEBUG
		ASSERT(args->len >= args->minlen);
		ASSERT(args->len <= args->maxlen);
		ASSERT(args->agbno % args->alignment == 0);
		XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
			args->len);
#endif

		/* Zero the extent if we were asked to do so */
		if (args->datatype & XFS_ALLOC_USERDATA_ZERO) {
			error = xfs_zero_extent(args->ip, args->fsbno, args->len);
			if (error)
				goto error0;
		}

	}
	xfs_perag_put(args->pag);
	return 0;
error0:
	xfs_perag_put(args->pag);
	return error;
}
2923
2a6da3b8
DC
2924/* Ensure that the freelist is at full capacity. */
2925int
2926xfs_free_extent_fix_freelist(
2927 struct xfs_trans *tp,
2928 xfs_agnumber_t agno,
2929 struct xfs_buf **agbp)
2bd0ea18 2930{
2a6da3b8
DC
2931 struct xfs_alloc_arg args;
2932 int error;
2bd0ea18 2933
2a6da3b8 2934 memset(&args, 0, sizeof(struct xfs_alloc_arg));
2bd0ea18
NS
2935 args.tp = tp;
2936 args.mp = tp->t_mountp;
2a6da3b8 2937 args.agno = agno;
a2ceac1f
DC
2938
2939 /*
2940 * validate that the block number is legal - the enables us to detect
2941 * and handle a silent filesystem corruption rather than crashing.
2942 */
a2ceac1f 2943 if (args.agno >= args.mp->m_sb.sb_agcount)
12b53197 2944 return -EFSCORRUPTED;
a2ceac1f 2945
56b2de80 2946 args.pag = xfs_perag_get(args.mp, args.agno);
a2ceac1f
DC
2947 ASSERT(args.pag);
2948
2949 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2950 if (error)
2a6da3b8
DC
2951 goto out;
2952
2953 *agbp = args.agbp;
2954out:
2955 xfs_perag_put(args.pag);
2956 return error;
2957}
2958
/*
 * Free an extent.
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
 */
int
__xfs_free_extent(
	struct xfs_trans		*tp,	/* transaction to free within */
	xfs_fsblock_t			bno,	/* starting fs block of extent */
	xfs_extlen_t			len,	/* length of extent, must be != 0 */
	const struct xfs_owner_info	*oinfo,	/* extent owner, for rmap */
	enum xfs_ag_resv_type		type,	/* AG reservation to credit */
	bool				skip_discard)	/* don't discard busy extent */
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_buf			*agbp;
	xfs_agnumber_t			agno = XFS_FSB_TO_AGNO(mp, bno);
	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp, bno);
	int				error;
	unsigned int			busy_flags = 0;

	ASSERT(len != 0);
	/* AGFL blocks are freed through a different path, never here. */
	ASSERT(type != XFS_AG_RESV_AGFL);

	/* Error-injection hook for testing free-extent failures. */
	if (XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_FREE_EXTENT))
		return -EIO;

	error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
	if (error)
		return error;

	/* Start block must lie inside the AG; jumps to err on corruption. */
	XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);

	/* validate the extent size is legal now we have the agf locked */
	XFS_WANT_CORRUPTED_GOTO(mp,
		agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
		err);

	error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
	if (error)
		goto err;

	/*
	 * Mark the range busy so it is not reallocated until the free commits
	 * to disk; optionally suppress the online discard of those blocks.
	 */
	if (skip_discard)
		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
	xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
	return 0;

err:
	/* Release the AGF buffer taken by xfs_free_extent_fix_freelist(). */
	xfs_trans_brelse(tp, agbp);
	return error;
}
b3d83fa6
DW
3011
/* Context handed through the btree walk to xfs_alloc_query_range_helper. */
struct xfs_alloc_query_range_info {
	xfs_alloc_query_range_fn	fn;	/* caller's per-record callback */
	void				*priv;	/* opaque data passed to fn */
};
3016
3017/* Format btree record and pass to our callback. */
3018STATIC int
3019xfs_alloc_query_range_helper(
3020 struct xfs_btree_cur *cur,
3021 union xfs_btree_rec *rec,
3022 void *priv)
3023{
3024 struct xfs_alloc_query_range_info *query = priv;
3025 struct xfs_alloc_rec_incore irec;
3026
3027 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3028 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3029 return query->fn(cur, &irec, query->priv);
3030}
3031
3032/* Find all free space within a given range of blocks. */
3033int
3034xfs_alloc_query_range(
3035 struct xfs_btree_cur *cur,
3036 struct xfs_alloc_rec_incore *low_rec,
3037 struct xfs_alloc_rec_incore *high_rec,
3038 xfs_alloc_query_range_fn fn,
3039 void *priv)
3040{
3041 union xfs_btree_irec low_brec;
3042 union xfs_btree_irec high_brec;
3043 struct xfs_alloc_query_range_info query;
3044
3045 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3046 low_brec.a = *low_rec;
3047 high_brec.a = *high_rec;
3048 query.priv = priv;
3049 query.fn = fn;
3050 return xfs_btree_query_range(cur, &low_brec, &high_brec,
3051 xfs_alloc_query_range_helper, &query);
3052}
7e05e856
DW
3053
3054/* Find all free space records. */
3055int
3056xfs_alloc_query_all(
3057 struct xfs_btree_cur *cur,
3058 xfs_alloc_query_range_fn fn,
3059 void *priv)
3060{
3061 struct xfs_alloc_query_range_info query;
3062
3063 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3064 query.priv = priv;
3065 query.fn = fn;
3066 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3067}
9bef6258 3068
1fe41a73
DW
3069/* Is there a record covering a given extent? */
3070int
3071xfs_alloc_has_record(
3072 struct xfs_btree_cur *cur,
3073 xfs_agblock_t bno,
3074 xfs_extlen_t len,
3075 bool *exists)
3076{
3077 union xfs_btree_irec low;
3078 union xfs_btree_irec high;
3079
3080 memset(&low, 0, sizeof(low));
3081 low.a.ar_startblock = bno;
3082 memset(&high, 0xFF, sizeof(high));
3083 high.a.ar_startblock = bno + len - 1;
3084
3085 return xfs_btree_has_record(cur, &low, &high, exists);
3086}
71a98c66
DW
3087
3088/*
3089 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
4a509d6d 3090 * error code or XFS_ITER_*.
71a98c66
DW
3091 */
3092int
3093xfs_agfl_walk(
3094 struct xfs_mount *mp,
3095 struct xfs_agf *agf,
3096 struct xfs_buf *agflbp,
3097 xfs_agfl_walk_fn walk_fn,
3098 void *priv)
3099{
3100 __be32 *agfl_bno;
3101 unsigned int i;
3102 int error;
3103
3104 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp);
3105 i = be32_to_cpu(agf->agf_flfirst);
3106
3107 /* Nothing to walk in an empty AGFL. */
3108 if (agf->agf_flcount == cpu_to_be32(0))
3109 return 0;
3110
3111 /* Otherwise, walk from first to last, wrapping as needed. */
3112 for (;;) {
3113 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3114 if (error)
3115 return error;
3116 if (i == be32_to_cpu(agf->agf_fllast))
3117 break;
3118 if (++i == xfs_agfl_size(mp))
3119 i = 0;
3120 }
3121
3122 return 0;
3123}