]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - libxfs/xfs_alloc.c
xfs: set xefi_discard when creating a deferred agfl free log intent item
[thirdparty/xfsprogs-dev.git] / libxfs / xfs_alloc.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "libxfs_priv.h"
7 #include "xfs_fs.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_sb.h"
14 #include "xfs_mount.h"
15 #include "xfs_defer.h"
16 #include "xfs_btree.h"
17 #include "xfs_rmap.h"
18 #include "xfs_alloc_btree.h"
19 #include "xfs_alloc.h"
20 #include "xfs_errortag.h"
21 #include "xfs_trace.h"
22 #include "xfs_trans.h"
23 #include "xfs_ag_resv.h"
24 #include "xfs_bmap.h"
25
26 extern kmem_zone_t *xfs_bmap_free_item_zone;
27
28 struct workqueue_struct *xfs_alloc_wq;
29
30 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
31
32 #define XFSA_FIXUP_BNO_OK 1
33 #define XFSA_FIXUP_CNT_OK 2
34
35 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
36 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
37 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
38
39 /*
40 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
41 * the beginning of the block for a proper header with the location information
42 * and CRC.
43 */
44 unsigned int
45 xfs_agfl_size(
46 struct xfs_mount *mp)
47 {
48 unsigned int size = mp->m_sb.sb_sectsize;
49
50 if (xfs_sb_version_hascrc(&mp->m_sb))
51 size -= sizeof(struct xfs_agfl);
52
53 return size / sizeof(xfs_agblock_t);
54 }
55
56 unsigned int
57 xfs_refc_block(
58 struct xfs_mount *mp)
59 {
60 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
61 return XFS_RMAP_BLOCK(mp) + 1;
62 if (xfs_sb_version_hasfinobt(&mp->m_sb))
63 return XFS_FIBT_BLOCK(mp) + 1;
64 return XFS_IBT_BLOCK(mp) + 1;
65 }
66
67 xfs_extlen_t
68 xfs_prealloc_blocks(
69 struct xfs_mount *mp)
70 {
71 if (xfs_sb_version_hasreflink(&mp->m_sb))
72 return xfs_refc_block(mp) + 1;
73 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
74 return XFS_RMAP_BLOCK(mp) + 1;
75 if (xfs_sb_version_hasfinobt(&mp->m_sb))
76 return XFS_FIBT_BLOCK(mp) + 1;
77 return XFS_IBT_BLOCK(mp) + 1;
78 }
79
80 /*
81 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
82 * AGF buffer (PV 947395), we place constraints on the relationship among
83 * actual allocations for data blocks, freelist blocks, and potential file data
84 * bmap btree blocks. However, these restrictions may result in no actual space
85 * allocated for a delayed extent, for example, a data block in a certain AG is
86 * allocated but there is no additional block for the additional bmap btree
87 * block due to a split of the bmap btree of the file. The result of this may
88 * lead to an infinite loop when the file gets flushed to disk and all delayed
89 * extents need to be actually allocated. To get around this, we explicitly set
90 * aside a few blocks which will not be reserved in delayed allocation.
91 *
92 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
93 * potential split of the file's bmap btree.
94 */
95 unsigned int
96 xfs_alloc_set_aside(
97 struct xfs_mount *mp)
98 {
99 return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
100 }
101
102 /*
103 * When deciding how much space to allocate out of an AG, we limit the
104 * allocation maximum size to the size the AG. However, we cannot use all the
105 * blocks in the AG - some are permanently used by metadata. These
106 * blocks are generally:
107 * - the AG superblock, AGF, AGI and AGFL
108 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
109 * the AGI free inode and rmap btree root blocks.
110 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
111 * - the rmapbt root block
112 *
113 * The AG headers are sector sized, so the amount of space they take up is
114 * dependent on filesystem geometry. The others are all single blocks.
115 */
116 unsigned int
117 xfs_alloc_ag_max_usable(
118 struct xfs_mount *mp)
119 {
120 unsigned int blocks;
121
122 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
123 blocks += XFS_ALLOC_AGFL_RESERVE;
124 blocks += 3; /* AGF, AGI btree root blocks */
125 if (xfs_sb_version_hasfinobt(&mp->m_sb))
126 blocks++; /* finobt root block */
127 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
128 blocks++; /* rmap root block */
129 if (xfs_sb_version_hasreflink(&mp->m_sb))
130 blocks++; /* refcount root block */
131
132 return mp->m_sb.sb_agblocks - blocks;
133 }
134
135 /*
136 * Lookup the record equal to [bno, len] in the btree given by cur.
137 */
138 STATIC int /* error */
139 xfs_alloc_lookup_eq(
140 struct xfs_btree_cur *cur, /* btree cursor */
141 xfs_agblock_t bno, /* starting block of extent */
142 xfs_extlen_t len, /* length of extent */
143 int *stat) /* success/failure */
144 {
145 int error;
146
147 cur->bc_rec.a.ar_startblock = bno;
148 cur->bc_rec.a.ar_blockcount = len;
149 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
150 cur->bc_ag.abt.active = (*stat == 1);
151 return error;
152 }
153
154 /*
155 * Lookup the first record greater than or equal to [bno, len]
156 * in the btree given by cur.
157 */
158 int /* error */
159 xfs_alloc_lookup_ge(
160 struct xfs_btree_cur *cur, /* btree cursor */
161 xfs_agblock_t bno, /* starting block of extent */
162 xfs_extlen_t len, /* length of extent */
163 int *stat) /* success/failure */
164 {
165 int error;
166
167 cur->bc_rec.a.ar_startblock = bno;
168 cur->bc_rec.a.ar_blockcount = len;
169 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
170 cur->bc_ag.abt.active = (*stat == 1);
171 return error;
172 }
173
174 /*
175 * Lookup the first record less than or equal to [bno, len]
176 * in the btree given by cur.
177 */
178 int /* error */
179 xfs_alloc_lookup_le(
180 struct xfs_btree_cur *cur, /* btree cursor */
181 xfs_agblock_t bno, /* starting block of extent */
182 xfs_extlen_t len, /* length of extent */
183 int *stat) /* success/failure */
184 {
185 int error;
186 cur->bc_rec.a.ar_startblock = bno;
187 cur->bc_rec.a.ar_blockcount = len;
188 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
189 cur->bc_ag.abt.active = (*stat == 1);
190 return error;
191 }
192
193 static inline bool
194 xfs_alloc_cur_active(
195 struct xfs_btree_cur *cur)
196 {
197 return cur && cur->bc_ag.abt.active;
198 }
199
200 /*
201 * Update the record referred to by cur to the value given
202 * by [bno, len].
203 * This either works (return 0) or gets an EFSCORRUPTED error.
204 */
205 STATIC int /* error */
206 xfs_alloc_update(
207 struct xfs_btree_cur *cur, /* btree cursor */
208 xfs_agblock_t bno, /* starting block of extent */
209 xfs_extlen_t len) /* length of extent */
210 {
211 union xfs_btree_rec rec;
212
213 rec.alloc.ar_startblock = cpu_to_be32(bno);
214 rec.alloc.ar_blockcount = cpu_to_be32(len);
215 return xfs_btree_update(cur, &rec);
216 }
217
218 /*
219 * Get the data from the pointed-to record.
220 */
221 int /* error */
222 xfs_alloc_get_rec(
223 struct xfs_btree_cur *cur, /* btree cursor */
224 xfs_agblock_t *bno, /* output: starting block of extent */
225 xfs_extlen_t *len, /* output: length of extent */
226 int *stat) /* output: success/failure */
227 {
228 struct xfs_mount *mp = cur->bc_mp;
229 xfs_agnumber_t agno = cur->bc_ag.agno;
230 union xfs_btree_rec *rec;
231 int error;
232
233 error = xfs_btree_get_rec(cur, &rec, stat);
234 if (error || !(*stat))
235 return error;
236
237 *bno = be32_to_cpu(rec->alloc.ar_startblock);
238 *len = be32_to_cpu(rec->alloc.ar_blockcount);
239
240 if (*len == 0)
241 goto out_bad_rec;
242
243 /* check for valid extent range, including overflow */
244 if (!xfs_verify_agbno(mp, agno, *bno))
245 goto out_bad_rec;
246 if (*bno > *bno + *len)
247 goto out_bad_rec;
248 if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
249 goto out_bad_rec;
250
251 return 0;
252
253 out_bad_rec:
254 xfs_warn(mp,
255 "%s Freespace BTree record corruption in AG %d detected!",
256 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
257 xfs_warn(mp,
258 "start block 0x%x block count 0x%x", *bno, *len);
259 return -EFSCORRUPTED;
260 }
261
262 /*
263 * Compute aligned version of the found extent.
264 * Takes alignment and min length into account.
265 */
266 STATIC bool
267 xfs_alloc_compute_aligned(
268 xfs_alloc_arg_t *args, /* allocation argument structure */
269 xfs_agblock_t foundbno, /* starting block in found extent */
270 xfs_extlen_t foundlen, /* length in found extent */
271 xfs_agblock_t *resbno, /* result block number */
272 xfs_extlen_t *reslen, /* result length */
273 unsigned *busy_gen)
274 {
275 xfs_agblock_t bno = foundbno;
276 xfs_extlen_t len = foundlen;
277 xfs_extlen_t diff;
278 bool busy;
279
280 /* Trim busy sections out of found extent */
281 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
282
283 /*
284 * If we have a largish extent that happens to start before min_agbno,
285 * see if we can shift it into range...
286 */
287 if (bno < args->min_agbno && bno + len > args->min_agbno) {
288 diff = args->min_agbno - bno;
289 if (len > diff) {
290 bno += diff;
291 len -= diff;
292 }
293 }
294
295 if (args->alignment > 1 && len >= args->minlen) {
296 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
297
298 diff = aligned_bno - bno;
299
300 *resbno = aligned_bno;
301 *reslen = diff >= len ? 0 : len - diff;
302 } else {
303 *resbno = bno;
304 *reslen = len;
305 }
306
307 return busy;
308 }
309
310 /*
311 * Compute best start block and diff for "near" allocations.
312 * freelen >= wantlen already checked by caller.
313 */
314 STATIC xfs_extlen_t /* difference value (absolute) */
315 xfs_alloc_compute_diff(
316 xfs_agblock_t wantbno, /* target starting block */
317 xfs_extlen_t wantlen, /* target length */
318 xfs_extlen_t alignment, /* target alignment */
319 int datatype, /* are we allocating data? */
320 xfs_agblock_t freebno, /* freespace's starting block */
321 xfs_extlen_t freelen, /* freespace's length */
322 xfs_agblock_t *newbnop) /* result: best start block from free */
323 {
324 xfs_agblock_t freeend; /* end of freespace extent */
325 xfs_agblock_t newbno1; /* return block number */
326 xfs_agblock_t newbno2; /* other new block number */
327 xfs_extlen_t newlen1=0; /* length with newbno1 */
328 xfs_extlen_t newlen2=0; /* length with newbno2 */
329 xfs_agblock_t wantend; /* end of target extent */
330 bool userdata = datatype & XFS_ALLOC_USERDATA;
331
332 ASSERT(freelen >= wantlen);
333 freeend = freebno + freelen;
334 wantend = wantbno + wantlen;
335 /*
336 * We want to allocate from the start of a free extent if it is past
337 * the desired block or if we are allocating user data and the free
338 * extent is before desired block. The second case is there to allow
339 * for contiguous allocation from the remaining free space if the file
340 * grows in the short term.
341 */
342 if (freebno >= wantbno || (userdata && freeend < wantend)) {
343 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
344 newbno1 = NULLAGBLOCK;
345 } else if (freeend >= wantend && alignment > 1) {
346 newbno1 = roundup(wantbno, alignment);
347 newbno2 = newbno1 - alignment;
348 if (newbno1 >= freeend)
349 newbno1 = NULLAGBLOCK;
350 else
351 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
352 if (newbno2 < freebno)
353 newbno2 = NULLAGBLOCK;
354 else
355 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
356 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
357 if (newlen1 < newlen2 ||
358 (newlen1 == newlen2 &&
359 XFS_ABSDIFF(newbno1, wantbno) >
360 XFS_ABSDIFF(newbno2, wantbno)))
361 newbno1 = newbno2;
362 } else if (newbno2 != NULLAGBLOCK)
363 newbno1 = newbno2;
364 } else if (freeend >= wantend) {
365 newbno1 = wantbno;
366 } else if (alignment > 1) {
367 newbno1 = roundup(freeend - wantlen, alignment);
368 if (newbno1 > freeend - wantlen &&
369 newbno1 - alignment >= freebno)
370 newbno1 -= alignment;
371 else if (newbno1 >= freeend)
372 newbno1 = NULLAGBLOCK;
373 } else
374 newbno1 = freeend - wantlen;
375 *newbnop = newbno1;
376 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
377 }
378
379 /*
380 * Fix up the length, based on mod and prod.
381 * len should be k * prod + mod for some k.
382 * If len is too small it is returned unchanged.
383 * If len hits maxlen it is left alone.
384 */
385 STATIC void
386 xfs_alloc_fix_len(
387 xfs_alloc_arg_t *args) /* allocation argument structure */
388 {
389 xfs_extlen_t k;
390 xfs_extlen_t rlen;
391
392 ASSERT(args->mod < args->prod);
393 rlen = args->len;
394 ASSERT(rlen >= args->minlen);
395 ASSERT(rlen <= args->maxlen);
396 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
397 (args->mod == 0 && rlen < args->prod))
398 return;
399 k = rlen % args->prod;
400 if (k == args->mod)
401 return;
402 if (k > args->mod)
403 rlen = rlen - (k - args->mod);
404 else
405 rlen = rlen - args->prod + (args->mod - k);
406 /* casts to (int) catch length underflows */
407 if ((int)rlen < (int)args->minlen)
408 return;
409 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
410 ASSERT(rlen % args->prod == args->mod);
411 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
412 rlen + args->minleft);
413 args->len = rlen;
414 }
415
416 /*
417 * Update the two btrees, logically removing from freespace the extent
418 * starting at rbno, rlen blocks. The extent is contained within the
419 * actual (current) free extent fbno for flen blocks.
420 * Flags are passed in indicating whether the cursors are set to the
421 * relevant records.
422 */
423 STATIC int /* error code */
424 xfs_alloc_fixup_trees(
425 xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */
426 xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */
427 xfs_agblock_t fbno, /* starting block of free extent */
428 xfs_extlen_t flen, /* length of free extent */
429 xfs_agblock_t rbno, /* starting block of returned extent */
430 xfs_extlen_t rlen, /* length of returned extent */
431 int flags) /* flags, XFSA_FIXUP_... */
432 {
433 int error; /* error code */
434 int i; /* operation results */
435 xfs_agblock_t nfbno1; /* first new free startblock */
436 xfs_agblock_t nfbno2; /* second new free startblock */
437 xfs_extlen_t nflen1=0; /* first new free length */
438 xfs_extlen_t nflen2=0; /* second new free length */
439 struct xfs_mount *mp;
440
441 mp = cnt_cur->bc_mp;
442
443 /*
444 * Look up the record in the by-size tree if necessary.
445 */
446 if (flags & XFSA_FIXUP_CNT_OK) {
447 #ifdef DEBUG
448 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
449 return error;
450 if (XFS_IS_CORRUPT(mp,
451 i != 1 ||
452 nfbno1 != fbno ||
453 nflen1 != flen))
454 return -EFSCORRUPTED;
455 #endif
456 } else {
457 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
458 return error;
459 if (XFS_IS_CORRUPT(mp, i != 1))
460 return -EFSCORRUPTED;
461 }
462 /*
463 * Look up the record in the by-block tree if necessary.
464 */
465 if (flags & XFSA_FIXUP_BNO_OK) {
466 #ifdef DEBUG
467 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
468 return error;
469 if (XFS_IS_CORRUPT(mp,
470 i != 1 ||
471 nfbno1 != fbno ||
472 nflen1 != flen))
473 return -EFSCORRUPTED;
474 #endif
475 } else {
476 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
477 return error;
478 if (XFS_IS_CORRUPT(mp, i != 1))
479 return -EFSCORRUPTED;
480 }
481
482 #ifdef DEBUG
483 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
484 struct xfs_btree_block *bnoblock;
485 struct xfs_btree_block *cntblock;
486
487 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
488 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
489
490 if (XFS_IS_CORRUPT(mp,
491 bnoblock->bb_numrecs !=
492 cntblock->bb_numrecs))
493 return -EFSCORRUPTED;
494 }
495 #endif
496
497 /*
498 * Deal with all four cases: the allocated record is contained
499 * within the freespace record, so we can have new freespace
500 * at either (or both) end, or no freespace remaining.
501 */
502 if (rbno == fbno && rlen == flen)
503 nfbno1 = nfbno2 = NULLAGBLOCK;
504 else if (rbno == fbno) {
505 nfbno1 = rbno + rlen;
506 nflen1 = flen - rlen;
507 nfbno2 = NULLAGBLOCK;
508 } else if (rbno + rlen == fbno + flen) {
509 nfbno1 = fbno;
510 nflen1 = flen - rlen;
511 nfbno2 = NULLAGBLOCK;
512 } else {
513 nfbno1 = fbno;
514 nflen1 = rbno - fbno;
515 nfbno2 = rbno + rlen;
516 nflen2 = (fbno + flen) - nfbno2;
517 }
518 /*
519 * Delete the entry from the by-size btree.
520 */
521 if ((error = xfs_btree_delete(cnt_cur, &i)))
522 return error;
523 if (XFS_IS_CORRUPT(mp, i != 1))
524 return -EFSCORRUPTED;
525 /*
526 * Add new by-size btree entry(s).
527 */
528 if (nfbno1 != NULLAGBLOCK) {
529 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
530 return error;
531 if (XFS_IS_CORRUPT(mp, i != 0))
532 return -EFSCORRUPTED;
533 if ((error = xfs_btree_insert(cnt_cur, &i)))
534 return error;
535 if (XFS_IS_CORRUPT(mp, i != 1))
536 return -EFSCORRUPTED;
537 }
538 if (nfbno2 != NULLAGBLOCK) {
539 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
540 return error;
541 if (XFS_IS_CORRUPT(mp, i != 0))
542 return -EFSCORRUPTED;
543 if ((error = xfs_btree_insert(cnt_cur, &i)))
544 return error;
545 if (XFS_IS_CORRUPT(mp, i != 1))
546 return -EFSCORRUPTED;
547 }
548 /*
549 * Fix up the by-block btree entry(s).
550 */
551 if (nfbno1 == NULLAGBLOCK) {
552 /*
553 * No remaining freespace, just delete the by-block tree entry.
554 */
555 if ((error = xfs_btree_delete(bno_cur, &i)))
556 return error;
557 if (XFS_IS_CORRUPT(mp, i != 1))
558 return -EFSCORRUPTED;
559 } else {
560 /*
561 * Update the by-block entry to start later|be shorter.
562 */
563 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
564 return error;
565 }
566 if (nfbno2 != NULLAGBLOCK) {
567 /*
568 * 2 resulting free entries, need to add one.
569 */
570 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
571 return error;
572 if (XFS_IS_CORRUPT(mp, i != 0))
573 return -EFSCORRUPTED;
574 if ((error = xfs_btree_insert(bno_cur, &i)))
575 return error;
576 if (XFS_IS_CORRUPT(mp, i != 1))
577 return -EFSCORRUPTED;
578 }
579 return 0;
580 }
581
582 static xfs_failaddr_t
583 xfs_agfl_verify(
584 struct xfs_buf *bp)
585 {
586 struct xfs_mount *mp = bp->b_mount;
587 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
588 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
589 int i;
590
591 /*
592 * There is no verification of non-crc AGFLs because mkfs does not
593 * initialise the AGFL to zero or NULL. Hence the only valid part of the
594 * AGFL is what the AGF says is active. We can't get to the AGF, so we
595 * can't verify just those entries are valid.
596 */
597 if (!xfs_sb_version_hascrc(&mp->m_sb))
598 return NULL;
599
600 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
601 return __this_address;
602 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
603 return __this_address;
604 /*
605 * during growfs operations, the perag is not fully initialised,
606 * so we can't use it for any useful checking. growfs ensures we can't
607 * use it by using uncached buffers that don't have the perag attached
608 * so we can detect and avoid this problem.
609 */
610 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
611 return __this_address;
612
613 for (i = 0; i < xfs_agfl_size(mp); i++) {
614 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
615 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
616 return __this_address;
617 }
618
619 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
620 return __this_address;
621 return NULL;
622 }
623
624 static void
625 xfs_agfl_read_verify(
626 struct xfs_buf *bp)
627 {
628 struct xfs_mount *mp = bp->b_mount;
629 xfs_failaddr_t fa;
630
631 /*
632 * There is no verification of non-crc AGFLs because mkfs does not
633 * initialise the AGFL to zero or NULL. Hence the only valid part of the
634 * AGFL is what the AGF says is active. We can't get to the AGF, so we
635 * can't verify just those entries are valid.
636 */
637 if (!xfs_sb_version_hascrc(&mp->m_sb))
638 return;
639
640 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
641 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
642 else {
643 fa = xfs_agfl_verify(bp);
644 if (fa)
645 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
646 }
647 }
648
649 static void
650 xfs_agfl_write_verify(
651 struct xfs_buf *bp)
652 {
653 struct xfs_mount *mp = bp->b_mount;
654 struct xfs_buf_log_item *bip = bp->b_log_item;
655 xfs_failaddr_t fa;
656
657 /* no verification of non-crc AGFLs */
658 if (!xfs_sb_version_hascrc(&mp->m_sb))
659 return;
660
661 fa = xfs_agfl_verify(bp);
662 if (fa) {
663 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
664 return;
665 }
666
667 if (bip)
668 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
669
670 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
671 }
672
673 const struct xfs_buf_ops xfs_agfl_buf_ops = {
674 .name = "xfs_agfl",
675 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
676 .verify_read = xfs_agfl_read_verify,
677 .verify_write = xfs_agfl_write_verify,
678 .verify_struct = xfs_agfl_verify,
679 };
680
681 /*
682 * Read in the allocation group free block array.
683 */
684 int /* error */
685 xfs_alloc_read_agfl(
686 xfs_mount_t *mp, /* mount point structure */
687 xfs_trans_t *tp, /* transaction pointer */
688 xfs_agnumber_t agno, /* allocation group number */
689 xfs_buf_t **bpp) /* buffer for the ag free block array */
690 {
691 xfs_buf_t *bp; /* return value */
692 int error;
693
694 ASSERT(agno != NULLAGNUMBER);
695 error = xfs_trans_read_buf(
696 mp, tp, mp->m_ddev_targp,
697 XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
698 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
699 if (error)
700 return error;
701 xfs_buf_set_ref(bp, XFS_AGFL_REF);
702 *bpp = bp;
703 return 0;
704 }
705
706 STATIC int
707 xfs_alloc_update_counters(
708 struct xfs_trans *tp,
709 struct xfs_buf *agbp,
710 long len)
711 {
712 struct xfs_agf *agf = agbp->b_addr;
713
714 agbp->b_pag->pagf_freeblks += len;
715 be32_add_cpu(&agf->agf_freeblks, len);
716
717 xfs_trans_agblocks_delta(tp, len);
718 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
719 be32_to_cpu(agf->agf_length))) {
720 xfs_buf_mark_corrupt(agbp);
721 return -EFSCORRUPTED;
722 }
723
724 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
725 return 0;
726 }
727
728 /*
729 * Block allocation algorithm and data structures.
730 */
731 struct xfs_alloc_cur {
732 struct xfs_btree_cur *cnt; /* btree cursors */
733 struct xfs_btree_cur *bnolt;
734 struct xfs_btree_cur *bnogt;
735 xfs_extlen_t cur_len;/* current search length */
736 xfs_agblock_t rec_bno;/* extent startblock */
737 xfs_extlen_t rec_len;/* extent length */
738 xfs_agblock_t bno; /* alloc bno */
739 xfs_extlen_t len; /* alloc len */
740 xfs_extlen_t diff; /* diff from search bno */
741 unsigned int busy_gen;/* busy state */
742 bool busy;
743 };
744
745 /*
746 * Set up cursors, etc. in the extent allocation cursor. This function can be
747 * called multiple times to reset an initialized structure without having to
748 * reallocate cursors.
749 */
750 static int
751 xfs_alloc_cur_setup(
752 struct xfs_alloc_arg *args,
753 struct xfs_alloc_cur *acur)
754 {
755 int error;
756 int i;
757
758 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
759
760 acur->cur_len = args->maxlen;
761 acur->rec_bno = 0;
762 acur->rec_len = 0;
763 acur->bno = 0;
764 acur->len = 0;
765 acur->diff = -1;
766 acur->busy = false;
767 acur->busy_gen = 0;
768
769 /*
770 * Perform an initial cntbt lookup to check for availability of maxlen
771 * extents. If this fails, we'll return -ENOSPC to signal the caller to
772 * attempt a small allocation.
773 */
774 if (!acur->cnt)
775 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
776 args->agbp, args->agno, XFS_BTNUM_CNT);
777 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
778 if (error)
779 return error;
780
781 /*
782 * Allocate the bnobt left and right search cursors.
783 */
784 if (!acur->bnolt)
785 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
786 args->agbp, args->agno, XFS_BTNUM_BNO);
787 if (!acur->bnogt)
788 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
789 args->agbp, args->agno, XFS_BTNUM_BNO);
790 return i == 1 ? 0 : -ENOSPC;
791 }
792
793 static void
794 xfs_alloc_cur_close(
795 struct xfs_alloc_cur *acur,
796 bool error)
797 {
798 int cur_error = XFS_BTREE_NOERROR;
799
800 if (error)
801 cur_error = XFS_BTREE_ERROR;
802
803 if (acur->cnt)
804 xfs_btree_del_cursor(acur->cnt, cur_error);
805 if (acur->bnolt)
806 xfs_btree_del_cursor(acur->bnolt, cur_error);
807 if (acur->bnogt)
808 xfs_btree_del_cursor(acur->bnogt, cur_error);
809 acur->cnt = acur->bnolt = acur->bnogt = NULL;
810 }
811
812 /*
813 * Check an extent for allocation and track the best available candidate in the
814 * allocation structure. The cursor is deactivated if it has entered an out of
815 * range state based on allocation arguments. Optionally return the extent
816 * extent geometry and allocation status if requested by the caller.
817 */
818 static int
819 xfs_alloc_cur_check(
820 struct xfs_alloc_arg *args,
821 struct xfs_alloc_cur *acur,
822 struct xfs_btree_cur *cur,
823 int *new)
824 {
825 int error, i;
826 xfs_agblock_t bno, bnoa, bnew;
827 xfs_extlen_t len, lena, diff = -1;
828 bool busy;
829 unsigned busy_gen = 0;
830 bool deactivate = false;
831 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
832
833 *new = 0;
834
835 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
836 if (error)
837 return error;
838 if (XFS_IS_CORRUPT(args->mp, i != 1))
839 return -EFSCORRUPTED;
840
841 /*
842 * Check minlen and deactivate a cntbt cursor if out of acceptable size
843 * range (i.e., walking backwards looking for a minlen extent).
844 */
845 if (len < args->minlen) {
846 deactivate = !isbnobt;
847 goto out;
848 }
849
850 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
851 &busy_gen);
852 acur->busy |= busy;
853 if (busy)
854 acur->busy_gen = busy_gen;
855 /* deactivate a bnobt cursor outside of locality range */
856 if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
857 deactivate = isbnobt;
858 goto out;
859 }
860 if (lena < args->minlen)
861 goto out;
862
863 args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
864 xfs_alloc_fix_len(args);
865 ASSERT(args->len >= args->minlen);
866 if (args->len < acur->len)
867 goto out;
868
869 /*
870 * We have an aligned record that satisfies minlen and beats or matches
871 * the candidate extent size. Compare locality for near allocation mode.
872 */
873 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
874 diff = xfs_alloc_compute_diff(args->agbno, args->len,
875 args->alignment, args->datatype,
876 bnoa, lena, &bnew);
877 if (bnew == NULLAGBLOCK)
878 goto out;
879
880 /*
881 * Deactivate a bnobt cursor with worse locality than the current best.
882 */
883 if (diff > acur->diff) {
884 deactivate = isbnobt;
885 goto out;
886 }
887
888 ASSERT(args->len > acur->len ||
889 (args->len == acur->len && diff <= acur->diff));
890 acur->rec_bno = bno;
891 acur->rec_len = len;
892 acur->bno = bnew;
893 acur->len = args->len;
894 acur->diff = diff;
895 *new = 1;
896
897 /*
898 * We're done if we found a perfect allocation. This only deactivates
899 * the current cursor, but this is just an optimization to terminate a
900 * cntbt search that otherwise runs to the edge of the tree.
901 */
902 if (acur->diff == 0 && acur->len == args->maxlen)
903 deactivate = true;
904 out:
905 if (deactivate)
906 cur->bc_ag.abt.active = false;
907 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
908 *new);
909 return 0;
910 }
911
912 /*
913 * Complete an allocation of a candidate extent. Remove the extent from both
914 * trees and update the args structure.
915 */
916 STATIC int
917 xfs_alloc_cur_finish(
918 struct xfs_alloc_arg *args,
919 struct xfs_alloc_cur *acur)
920 {
921 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
922 int error;
923
924 ASSERT(acur->cnt && acur->bnolt);
925 ASSERT(acur->bno >= acur->rec_bno);
926 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
927 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
928
929 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
930 acur->rec_len, acur->bno, acur->len, 0);
931 if (error)
932 return error;
933
934 args->agbno = acur->bno;
935 args->len = acur->len;
936 args->wasfromfl = 0;
937
938 trace_xfs_alloc_cur(args);
939 return 0;
940 }
941
942 /*
943 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
944 * bno optimized lookup to search for extents with ideal size and locality.
945 */
946 STATIC int
947 xfs_alloc_cntbt_iter(
948 struct xfs_alloc_arg *args,
949 struct xfs_alloc_cur *acur)
950 {
951 struct xfs_btree_cur *cur = acur->cnt;
952 xfs_agblock_t bno;
953 xfs_extlen_t len, cur_len;
954 int error;
955 int i;
956
957 if (!xfs_alloc_cur_active(cur))
958 return 0;
959
960 /* locality optimized lookup */
961 cur_len = acur->cur_len;
962 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
963 if (error)
964 return error;
965 if (i == 0)
966 return 0;
967 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
968 if (error)
969 return error;
970
971 /* check the current record and update search length from it */
972 error = xfs_alloc_cur_check(args, acur, cur, &i);
973 if (error)
974 return error;
975 ASSERT(len >= acur->cur_len);
976 acur->cur_len = len;
977
978 /*
979 * We looked up the first record >= [agbno, len] above. The agbno is a
980 * secondary key and so the current record may lie just before or after
981 * agbno. If it is past agbno, check the previous record too so long as
982 * the length matches as it may be closer. Don't check a smaller record
983 * because that could deactivate our cursor.
984 */
985 if (bno > args->agbno) {
986 error = xfs_btree_decrement(cur, 0, &i);
987 if (!error && i) {
988 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
989 if (!error && i && len == acur->cur_len)
990 error = xfs_alloc_cur_check(args, acur, cur,
991 &i);
992 }
993 if (error)
994 return error;
995 }
996
997 /*
998 * Increment the search key until we find at least one allocation
999 * candidate or if the extent we found was larger. Otherwise, double the
1000 * search key to optimize the search. Efficiency is more important here
1001 * than absolute best locality.
1002 */
1003 cur_len <<= 1;
1004 if (!acur->len || acur->cur_len >= cur_len)
1005 acur->cur_len++;
1006 else
1007 acur->cur_len = cur_len;
1008
1009 return error;
1010 }
1011
1012 /*
1013 * Deal with the case where only small freespaces remain. Either return the
1014 * contents of the last freespace record, or allocate space from the freelist if
1015 * there is nothing in the tree.
1016 */
1017 STATIC int /* error */
1018 xfs_alloc_ag_vextent_small(
1019 struct xfs_alloc_arg *args, /* allocation argument structure */
1020 struct xfs_btree_cur *ccur, /* optional by-size cursor */
1021 xfs_agblock_t *fbnop, /* result block number */
1022 xfs_extlen_t *flenp, /* result length */
1023 int *stat) /* status: 0-freelist, 1-normal/none */
1024 {
1025 struct xfs_agf *agf = args->agbp->b_addr;
1026 int error = 0;
1027 xfs_agblock_t fbno = NULLAGBLOCK;
1028 xfs_extlen_t flen = 0;
1029 int i = 0;
1030
1031 /*
1032 * If a cntbt cursor is provided, try to allocate the largest record in
1033 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1034 * allocation. Make sure to respect minleft even when pulling from the
1035 * freelist.
1036 */
1037 if (ccur)
1038 error = xfs_btree_decrement(ccur, 0, &i);
1039 if (error)
1040 goto error;
1041 if (i) {
1042 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1043 if (error)
1044 goto error;
1045 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1046 error = -EFSCORRUPTED;
1047 goto error;
1048 }
1049 goto out;
1050 }
1051
1052 if (args->minlen != 1 || args->alignment != 1 ||
1053 args->resv == XFS_AG_RESV_AGFL ||
1054 be32_to_cpu(agf->agf_flcount) <= args->minleft)
1055 goto out;
1056
1057 error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1058 if (error)
1059 goto error;
1060 if (fbno == NULLAGBLOCK)
1061 goto out;
1062
1063 xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1064 (args->datatype & XFS_ALLOC_NOBUSY));
1065
1066 if (args->datatype & XFS_ALLOC_USERDATA) {
1067 struct xfs_buf *bp;
1068
1069 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1070 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1071 args->mp->m_bsize, 0, &bp);
1072 if (error)
1073 goto error;
1074 xfs_trans_binval(args->tp, bp);
1075 }
1076 *fbnop = args->agbno = fbno;
1077 *flenp = args->len = 1;
1078 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1079 error = -EFSCORRUPTED;
1080 goto error;
1081 }
1082 args->wasfromfl = 1;
1083 trace_xfs_alloc_small_freelist(args);
1084
1085 /*
1086 * If we're feeding an AGFL block to something that doesn't live in the
1087 * free space, we need to clear out the OWN_AG rmap.
1088 */
1089 error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
1090 &XFS_RMAP_OINFO_AG);
1091 if (error)
1092 goto error;
1093
1094 *stat = 0;
1095 return 0;
1096
1097 out:
1098 /*
1099 * Can't do the allocation, give up.
1100 */
1101 if (flen < args->minlen) {
1102 args->agbno = NULLAGBLOCK;
1103 trace_xfs_alloc_small_notenough(args);
1104 flen = 0;
1105 }
1106 *fbnop = fbno;
1107 *flenp = flen;
1108 *stat = 1;
1109 trace_xfs_alloc_small_done(args);
1110 return 0;
1111
1112 error:
1113 trace_xfs_alloc_small_error(args);
1114 return error;
1115 }
1116
1117 /*
1118 * Allocate a variable extent in the allocation group agno.
1119 * Type and bno are used to determine where in the allocation group the
1120 * extent will start.
1121 * Extent's length (returned in *len) will be between minlen and maxlen,
1122 * and of the form k * prod + mod unless there's nothing that large.
1123 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1124 */
1125 STATIC int /* error */
1126 xfs_alloc_ag_vextent(
1127 xfs_alloc_arg_t *args) /* argument structure for allocation */
1128 {
1129 int error=0;
1130
1131 ASSERT(args->minlen > 0);
1132 ASSERT(args->maxlen > 0);
1133 ASSERT(args->minlen <= args->maxlen);
1134 ASSERT(args->mod < args->prod);
1135 ASSERT(args->alignment > 0);
1136
1137 /*
1138 * Branch to correct routine based on the type.
1139 */
1140 args->wasfromfl = 0;
1141 switch (args->type) {
1142 case XFS_ALLOCTYPE_THIS_AG:
1143 error = xfs_alloc_ag_vextent_size(args);
1144 break;
1145 case XFS_ALLOCTYPE_NEAR_BNO:
1146 error = xfs_alloc_ag_vextent_near(args);
1147 break;
1148 case XFS_ALLOCTYPE_THIS_BNO:
1149 error = xfs_alloc_ag_vextent_exact(args);
1150 break;
1151 default:
1152 ASSERT(0);
1153 /* NOTREACHED */
1154 }
1155
1156 if (error || args->agbno == NULLAGBLOCK)
1157 return error;
1158
1159 ASSERT(args->len >= args->minlen);
1160 ASSERT(args->len <= args->maxlen);
1161 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1162 ASSERT(args->agbno % args->alignment == 0);
1163
1164 /* if not file data, insert new block into the reverse map btree */
1165 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1166 error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
1167 args->agbno, args->len, &args->oinfo);
1168 if (error)
1169 return error;
1170 }
1171
1172 if (!args->wasfromfl) {
1173 error = xfs_alloc_update_counters(args->tp, args->agbp,
1174 -((long)(args->len)));
1175 if (error)
1176 return error;
1177
1178 ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
1179 args->agbno, args->len));
1180 }
1181
1182 xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1183
1184 XFS_STATS_INC(args->mp, xs_allocx);
1185 XFS_STATS_ADD(args->mp, xs_allocb, args->len);
1186 return error;
1187 }
1188
1189 /*
1190 * Allocate a variable extent at exactly agno/bno.
1191 * Extent's length (returned in *len) will be between minlen and maxlen,
1192 * and of the form k * prod + mod unless there's nothing that large.
1193 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1194 */
1195 STATIC int /* error */
1196 xfs_alloc_ag_vextent_exact(
1197 xfs_alloc_arg_t *args) /* allocation argument structure */
1198 {
1199 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1200 xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
1201 xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
1202 int error;
1203 xfs_agblock_t fbno; /* start block of found extent */
1204 xfs_extlen_t flen; /* length of found extent */
1205 xfs_agblock_t tbno; /* start block of busy extent */
1206 xfs_extlen_t tlen; /* length of busy extent */
1207 xfs_agblock_t tend; /* end block of busy extent */
1208 int i; /* success/failure of operation */
1209 unsigned busy_gen;
1210
1211 ASSERT(args->alignment == 1);
1212
1213 /*
1214 * Allocate/initialize a cursor for the by-number freespace btree.
1215 */
1216 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1217 args->agno, XFS_BTNUM_BNO);
1218
1219 /*
1220 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1221 * Look for the closest free block <= bno, it must contain bno
1222 * if any free block does.
1223 */
1224 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1225 if (error)
1226 goto error0;
1227 if (!i)
1228 goto not_found;
1229
1230 /*
1231 * Grab the freespace record.
1232 */
1233 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1234 if (error)
1235 goto error0;
1236 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1237 error = -EFSCORRUPTED;
1238 goto error0;
1239 }
1240 ASSERT(fbno <= args->agbno);
1241
1242 /*
1243 * Check for overlapping busy extents.
1244 */
1245 tbno = fbno;
1246 tlen = flen;
1247 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1248
1249 /*
1250 * Give up if the start of the extent is busy, or the freespace isn't
1251 * long enough for the minimum request.
1252 */
1253 if (tbno > args->agbno)
1254 goto not_found;
1255 if (tlen < args->minlen)
1256 goto not_found;
1257 tend = tbno + tlen;
1258 if (tend < args->agbno + args->minlen)
1259 goto not_found;
1260
1261 /*
1262 * End of extent will be smaller of the freespace end and the
1263 * maximal requested end.
1264 *
1265 * Fix the length according to mod and prod if given.
1266 */
1267 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1268 - args->agbno;
1269 xfs_alloc_fix_len(args);
1270 ASSERT(args->agbno + args->len <= tend);
1271
1272 /*
1273 * We are allocating agbno for args->len
1274 * Allocate/initialize a cursor for the by-size btree.
1275 */
1276 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1277 args->agno, XFS_BTNUM_CNT);
1278 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1279 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1280 args->len, XFSA_FIXUP_BNO_OK);
1281 if (error) {
1282 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1283 goto error0;
1284 }
1285
1286 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1287 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1288
1289 args->wasfromfl = 0;
1290 trace_xfs_alloc_exact_done(args);
1291 return 0;
1292
1293 not_found:
1294 /* Didn't find it, return null. */
1295 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1296 args->agbno = NULLAGBLOCK;
1297 trace_xfs_alloc_exact_notfound(args);
1298 return 0;
1299
1300 error0:
1301 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1302 trace_xfs_alloc_exact_error(args);
1303 return error;
1304 }
1305
1306 /*
1307 * Search a given number of btree records in a given direction. Check each
1308 * record against the good extent we've already found.
1309 */
1310 STATIC int
1311 xfs_alloc_walk_iter(
1312 struct xfs_alloc_arg *args,
1313 struct xfs_alloc_cur *acur,
1314 struct xfs_btree_cur *cur,
1315 bool increment,
1316 bool find_one, /* quit on first candidate */
1317 int count, /* rec count (-1 for infinite) */
1318 int *stat)
1319 {
1320 int error;
1321 int i;
1322
1323 *stat = 0;
1324
1325 /*
1326 * Search so long as the cursor is active or we find a better extent.
1327 * The cursor is deactivated if it extends beyond the range of the
1328 * current allocation candidate.
1329 */
1330 while (xfs_alloc_cur_active(cur) && count) {
1331 error = xfs_alloc_cur_check(args, acur, cur, &i);
1332 if (error)
1333 return error;
1334 if (i == 1) {
1335 *stat = 1;
1336 if (find_one)
1337 break;
1338 }
1339 if (!xfs_alloc_cur_active(cur))
1340 break;
1341
1342 if (increment)
1343 error = xfs_btree_increment(cur, 0, &i);
1344 else
1345 error = xfs_btree_decrement(cur, 0, &i);
1346 if (error)
1347 return error;
1348 if (i == 0)
1349 cur->bc_ag.abt.active = false;
1350
1351 if (count > 0)
1352 count--;
1353 }
1354
1355 return 0;
1356 }
1357
1358 /*
1359 * Search the by-bno and by-size btrees in parallel in search of an extent with
1360 * ideal locality based on the NEAR mode ->agbno locality hint.
1361 */
1362 STATIC int
1363 xfs_alloc_ag_vextent_locality(
1364 struct xfs_alloc_arg *args,
1365 struct xfs_alloc_cur *acur,
1366 int *stat)
1367 {
1368 struct xfs_btree_cur *fbcur = NULL;
1369 int error;
1370 int i;
1371 bool fbinc;
1372
1373 ASSERT(acur->len == 0);
1374 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
1375
1376 *stat = 0;
1377
1378 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1379 if (error)
1380 return error;
1381 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1382 if (error)
1383 return error;
1384 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1385 if (error)
1386 return error;
1387
1388 /*
1389 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1390 * right and lookup the closest extent to the locality hint for each
1391 * extent size key in the cntbt. The entire search terminates
1392 * immediately on a bnobt hit because that means we've found best case
1393 * locality. Otherwise the search continues until the cntbt cursor runs
1394 * off the end of the tree. If no allocation candidate is found at this
1395 * point, give up on locality, walk backwards from the end of the cntbt
1396 * and take the first available extent.
1397 *
1398 * The parallel tree searches balance each other out to provide fairly
1399 * consistent performance for various situations. The bnobt search can
1400 * have pathological behavior in the worst case scenario of larger
1401 * allocation requests and fragmented free space. On the other hand, the
1402 * bnobt is able to satisfy most smaller allocation requests much more
1403 * quickly than the cntbt. The cntbt search can sift through fragmented
1404 * free space and sets of free extents for larger allocation requests
1405 * more quickly than the bnobt. Since the locality hint is just a hint
1406 * and we don't want to scan the entire bnobt for perfect locality, the
1407 * cntbt search essentially bounds the bnobt search such that we can
1408 * find good enough locality at reasonable performance in most cases.
1409 */
1410 while (xfs_alloc_cur_active(acur->bnolt) ||
1411 xfs_alloc_cur_active(acur->bnogt) ||
1412 xfs_alloc_cur_active(acur->cnt)) {
1413
1414 trace_xfs_alloc_cur_lookup(args);
1415
1416 /*
1417 * Search the bnobt left and right. In the case of a hit, finish
1418 * the search in the opposite direction and we're done.
1419 */
1420 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1421 true, 1, &i);
1422 if (error)
1423 return error;
1424 if (i == 1) {
1425 trace_xfs_alloc_cur_left(args);
1426 fbcur = acur->bnogt;
1427 fbinc = true;
1428 break;
1429 }
1430 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1431 1, &i);
1432 if (error)
1433 return error;
1434 if (i == 1) {
1435 trace_xfs_alloc_cur_right(args);
1436 fbcur = acur->bnolt;
1437 fbinc = false;
1438 break;
1439 }
1440
1441 /*
1442 * Check the extent with best locality based on the current
1443 * extent size search key and keep track of the best candidate.
1444 */
1445 error = xfs_alloc_cntbt_iter(args, acur);
1446 if (error)
1447 return error;
1448 if (!xfs_alloc_cur_active(acur->cnt)) {
1449 trace_xfs_alloc_cur_lookup_done(args);
1450 break;
1451 }
1452 }
1453
1454 /*
1455 * If we failed to find anything due to busy extents, return empty
1456 * handed so the caller can flush and retry. If no busy extents were
1457 * found, walk backwards from the end of the cntbt as a last resort.
1458 */
1459 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1460 error = xfs_btree_decrement(acur->cnt, 0, &i);
1461 if (error)
1462 return error;
1463 if (i) {
1464 acur->cnt->bc_ag.abt.active = true;
1465 fbcur = acur->cnt;
1466 fbinc = false;
1467 }
1468 }
1469
1470 /*
1471 * Search in the opposite direction for a better entry in the case of
1472 * a bnobt hit or walk backwards from the end of the cntbt.
1473 */
1474 if (fbcur) {
1475 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1476 &i);
1477 if (error)
1478 return error;
1479 }
1480
1481 if (acur->len)
1482 *stat = 1;
1483
1484 return 0;
1485 }
1486
1487 /* Check the last block of the cnt btree for allocations. */
1488 static int
1489 xfs_alloc_ag_vextent_lastblock(
1490 struct xfs_alloc_arg *args,
1491 struct xfs_alloc_cur *acur,
1492 xfs_agblock_t *bno,
1493 xfs_extlen_t *len,
1494 bool *allocated)
1495 {
1496 int error;
1497 int i;
1498
1499 #ifdef DEBUG
1500 /* Randomly don't execute the first algorithm. */
1501 if (prandom_u32() & 1)
1502 return 0;
1503 #endif
1504
1505 /*
1506 * Start from the entry that lookup found, sequence through all larger
1507 * free blocks. If we're actually pointing at a record smaller than
1508 * maxlen, go to the start of this block, and skip all those smaller
1509 * than minlen.
1510 */
1511 if (*len || args->alignment > 1) {
1512 acur->cnt->bc_ptrs[0] = 1;
1513 do {
1514 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1515 if (error)
1516 return error;
1517 if (XFS_IS_CORRUPT(args->mp, i != 1))
1518 return -EFSCORRUPTED;
1519 if (*len >= args->minlen)
1520 break;
1521 error = xfs_btree_increment(acur->cnt, 0, &i);
1522 if (error)
1523 return error;
1524 } while (i);
1525 ASSERT(*len >= args->minlen);
1526 if (!i)
1527 return 0;
1528 }
1529
1530 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1531 if (error)
1532 return error;
1533
1534 /*
1535 * It didn't work. We COULD be in a case where there's a good record
1536 * somewhere, so try again.
1537 */
1538 if (acur->len == 0)
1539 return 0;
1540
1541 trace_xfs_alloc_near_first(args);
1542 *allocated = true;
1543 return 0;
1544 }
1545
1546 /*
1547 * Allocate a variable extent near bno in the allocation group agno.
1548 * Extent's length (returned in len) will be between minlen and maxlen,
1549 * and of the form k * prod + mod unless there's nothing that large.
1550 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1551 */
1552 STATIC int
1553 xfs_alloc_ag_vextent_near(
1554 struct xfs_alloc_arg *args)
1555 {
1556 struct xfs_alloc_cur acur = {};
1557 int error; /* error code */
1558 int i; /* result code, temporary */
1559 xfs_agblock_t bno;
1560 xfs_extlen_t len;
1561
1562 /* handle uninitialized agbno range so caller doesn't have to */
1563 if (!args->min_agbno && !args->max_agbno)
1564 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1565 ASSERT(args->min_agbno <= args->max_agbno);
1566
1567 /* clamp agbno to the range if it's outside */
1568 if (args->agbno < args->min_agbno)
1569 args->agbno = args->min_agbno;
1570 if (args->agbno > args->max_agbno)
1571 args->agbno = args->max_agbno;
1572
1573 restart:
1574 len = 0;
1575
1576 /*
1577 * Set up cursors and see if there are any free extents as big as
1578 * maxlen. If not, pick the last entry in the tree unless the tree is
1579 * empty.
1580 */
1581 error = xfs_alloc_cur_setup(args, &acur);
1582 if (error == -ENOSPC) {
1583 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1584 &len, &i);
1585 if (error)
1586 goto out;
1587 if (i == 0 || len == 0) {
1588 trace_xfs_alloc_near_noentry(args);
1589 goto out;
1590 }
1591 ASSERT(i == 1);
1592 } else if (error) {
1593 goto out;
1594 }
1595
1596 /*
1597 * First algorithm.
1598 * If the requested extent is large wrt the freespaces available
1599 * in this a.g., then the cursor will be pointing to a btree entry
1600 * near the right edge of the tree. If it's in the last btree leaf
1601 * block, then we just examine all the entries in that block
1602 * that are big enough, and pick the best one.
1603 */
1604 if (xfs_btree_islastblock(acur.cnt, 0)) {
1605 bool allocated = false;
1606
1607 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1608 &allocated);
1609 if (error)
1610 goto out;
1611 if (allocated)
1612 goto alloc_finish;
1613 }
1614
1615 /*
1616 * Second algorithm. Combined cntbt and bnobt search to find ideal
1617 * locality.
1618 */
1619 error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1620 if (error)
1621 goto out;
1622
1623 /*
1624 * If we couldn't get anything, give up.
1625 */
1626 if (!acur.len) {
1627 if (acur.busy) {
1628 trace_xfs_alloc_near_busy(args);
1629 xfs_extent_busy_flush(args->mp, args->pag,
1630 acur.busy_gen);
1631 goto restart;
1632 }
1633 trace_xfs_alloc_size_neither(args);
1634 args->agbno = NULLAGBLOCK;
1635 goto out;
1636 }
1637
1638 alloc_finish:
1639 /* fix up btrees on a successful allocation */
1640 error = xfs_alloc_cur_finish(args, &acur);
1641
1642 out:
1643 xfs_alloc_cur_close(&acur, error);
1644 return error;
1645 }
1646
1647 /*
1648 * Allocate a variable extent anywhere in the allocation group agno.
1649 * Extent's length (returned in len) will be between minlen and maxlen,
1650 * and of the form k * prod + mod unless there's nothing that large.
1651 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1652 */
1653 STATIC int /* error */
1654 xfs_alloc_ag_vextent_size(
1655 xfs_alloc_arg_t *args) /* allocation argument structure */
1656 {
1657 struct xfs_agf *agf = args->agbp->b_addr;
1658 xfs_btree_cur_t *bno_cur; /* cursor for bno btree */
1659 xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */
1660 int error; /* error result */
1661 xfs_agblock_t fbno; /* start of found freespace */
1662 xfs_extlen_t flen; /* length of found freespace */
1663 int i; /* temp status variable */
1664 xfs_agblock_t rbno; /* returned block number */
1665 xfs_extlen_t rlen; /* length of returned extent */
1666 bool busy;
1667 unsigned busy_gen;
1668
1669 restart:
1670 /*
1671 * Allocate and initialize a cursor for the by-size btree.
1672 */
1673 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1674 args->agno, XFS_BTNUM_CNT);
1675 bno_cur = NULL;
1676 busy = false;
1677
1678 /*
1679 * Look for an entry >= maxlen+alignment-1 blocks.
1680 */
1681 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1682 args->maxlen + args->alignment - 1, &i)))
1683 goto error0;
1684
1685 /*
1686 * If none then we have to settle for a smaller extent. In the case that
1687 * there are no large extents, this will return the last entry in the
1688 * tree unless the tree is empty. In the case that there are only busy
1689 * large extents, this will return the largest small extent unless there
1690 * are no smaller extents available.
1691 */
1692 if (!i) {
1693 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1694 &fbno, &flen, &i);
1695 if (error)
1696 goto error0;
1697 if (i == 0 || flen == 0) {
1698 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1699 trace_xfs_alloc_size_noentry(args);
1700 return 0;
1701 }
1702 ASSERT(i == 1);
1703 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1704 &rlen, &busy_gen);
1705 } else {
1706 /*
1707 * Search for a non-busy extent that is large enough.
1708 */
1709 for (;;) {
1710 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1711 if (error)
1712 goto error0;
1713 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1714 error = -EFSCORRUPTED;
1715 goto error0;
1716 }
1717
1718 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1719 &rbno, &rlen, &busy_gen);
1720
1721 if (rlen >= args->maxlen)
1722 break;
1723
1724 error = xfs_btree_increment(cnt_cur, 0, &i);
1725 if (error)
1726 goto error0;
1727 if (i == 0) {
1728 /*
1729 * Our only valid extents must have been busy.
1730 * Make it unbusy by forcing the log out and
1731 * retrying.
1732 */
1733 xfs_btree_del_cursor(cnt_cur,
1734 XFS_BTREE_NOERROR);
1735 trace_xfs_alloc_size_busy(args);
1736 xfs_extent_busy_flush(args->mp,
1737 args->pag, busy_gen);
1738 goto restart;
1739 }
1740 }
1741 }
1742
1743 /*
1744 * In the first case above, we got the last entry in the
1745 * by-size btree. Now we check to see if the space hits maxlen
1746 * once aligned; if not, we search left for something better.
1747 * This can't happen in the second case above.
1748 */
1749 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1750 if (XFS_IS_CORRUPT(args->mp,
1751 rlen != 0 &&
1752 (rlen > flen ||
1753 rbno + rlen > fbno + flen))) {
1754 error = -EFSCORRUPTED;
1755 goto error0;
1756 }
1757 if (rlen < args->maxlen) {
1758 xfs_agblock_t bestfbno;
1759 xfs_extlen_t bestflen;
1760 xfs_agblock_t bestrbno;
1761 xfs_extlen_t bestrlen;
1762
1763 bestrlen = rlen;
1764 bestrbno = rbno;
1765 bestflen = flen;
1766 bestfbno = fbno;
1767 for (;;) {
1768 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1769 goto error0;
1770 if (i == 0)
1771 break;
1772 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1773 &i)))
1774 goto error0;
1775 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1776 error = -EFSCORRUPTED;
1777 goto error0;
1778 }
1779 if (flen < bestrlen)
1780 break;
1781 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1782 &rbno, &rlen, &busy_gen);
1783 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1784 if (XFS_IS_CORRUPT(args->mp,
1785 rlen != 0 &&
1786 (rlen > flen ||
1787 rbno + rlen > fbno + flen))) {
1788 error = -EFSCORRUPTED;
1789 goto error0;
1790 }
1791 if (rlen > bestrlen) {
1792 bestrlen = rlen;
1793 bestrbno = rbno;
1794 bestflen = flen;
1795 bestfbno = fbno;
1796 if (rlen == args->maxlen)
1797 break;
1798 }
1799 }
1800 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1801 &i)))
1802 goto error0;
1803 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1804 error = -EFSCORRUPTED;
1805 goto error0;
1806 }
1807 rlen = bestrlen;
1808 rbno = bestrbno;
1809 flen = bestflen;
1810 fbno = bestfbno;
1811 }
1812 args->wasfromfl = 0;
1813 /*
1814 * Fix up the length.
1815 */
1816 args->len = rlen;
1817 if (rlen < args->minlen) {
1818 if (busy) {
1819 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1820 trace_xfs_alloc_size_busy(args);
1821 xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1822 goto restart;
1823 }
1824 goto out_nominleft;
1825 }
1826 xfs_alloc_fix_len(args);
1827
1828 rlen = args->len;
1829 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1830 error = -EFSCORRUPTED;
1831 goto error0;
1832 }
1833 /*
1834 * Allocate and initialize a cursor for the by-block tree.
1835 */
1836 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1837 args->agno, XFS_BTNUM_BNO);
1838 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1839 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1840 goto error0;
1841 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1842 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1843 cnt_cur = bno_cur = NULL;
1844 args->len = rlen;
1845 args->agbno = rbno;
1846 if (XFS_IS_CORRUPT(args->mp,
1847 args->agbno + args->len >
1848 be32_to_cpu(agf->agf_length))) {
1849 error = -EFSCORRUPTED;
1850 goto error0;
1851 }
1852 trace_xfs_alloc_size_done(args);
1853 return 0;
1854
1855 error0:
1856 trace_xfs_alloc_size_error(args);
1857 if (cnt_cur)
1858 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1859 if (bno_cur)
1860 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1861 return error;
1862
1863 out_nominleft:
1864 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1865 trace_xfs_alloc_size_nominleft(args);
1866 args->agbno = NULLAGBLOCK;
1867 return 0;
1868 }
1869
1870 /*
1871 * Free the extent starting at agno/bno for length.
1872 */
1873 STATIC int
1874 xfs_free_ag_extent(
1875 struct xfs_trans *tp,
1876 struct xfs_buf *agbp,
1877 xfs_agnumber_t agno,
1878 xfs_agblock_t bno,
1879 xfs_extlen_t len,
1880 const struct xfs_owner_info *oinfo,
1881 enum xfs_ag_resv_type type)
1882 {
1883 struct xfs_mount *mp;
1884 struct xfs_btree_cur *bno_cur;
1885 struct xfs_btree_cur *cnt_cur;
1886 xfs_agblock_t gtbno; /* start of right neighbor */
1887 xfs_extlen_t gtlen; /* length of right neighbor */
1888 xfs_agblock_t ltbno; /* start of left neighbor */
1889 xfs_extlen_t ltlen; /* length of left neighbor */
1890 xfs_agblock_t nbno; /* new starting block of freesp */
1891 xfs_extlen_t nlen; /* new length of freespace */
1892 int haveleft; /* have a left neighbor */
1893 int haveright; /* have a right neighbor */
1894 int i;
1895 int error;
1896
1897 bno_cur = cnt_cur = NULL;
1898 mp = tp->t_mountp;
1899
1900 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1901 error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
1902 if (error)
1903 goto error0;
1904 }
1905
1906 /*
1907 * Allocate and initialize a cursor for the by-block btree.
1908 */
1909 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1910 /*
1911 * Look for a neighboring block on the left (lower block numbers)
1912 * that is contiguous with this space.
1913 */
1914 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1915 goto error0;
1916 if (haveleft) {
1917 /*
1918 * There is a block to our left.
1919 */
1920 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1921 goto error0;
1922 if (XFS_IS_CORRUPT(mp, i != 1)) {
1923 error = -EFSCORRUPTED;
1924 goto error0;
1925 }
1926 /*
1927 * It's not contiguous, though.
1928 */
1929 if (ltbno + ltlen < bno)
1930 haveleft = 0;
1931 else {
1932 /*
1933 * If this failure happens the request to free this
1934 * space was invalid, it's (partly) already free.
1935 * Very bad.
1936 */
1937 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1938 error = -EFSCORRUPTED;
1939 goto error0;
1940 }
1941 }
1942 }
1943 /*
1944 * Look for a neighboring block on the right (higher block numbers)
1945 * that is contiguous with this space.
1946 */
1947 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1948 goto error0;
1949 if (haveright) {
1950 /*
1951 * There is a block to our right.
1952 */
1953 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1954 goto error0;
1955 if (XFS_IS_CORRUPT(mp, i != 1)) {
1956 error = -EFSCORRUPTED;
1957 goto error0;
1958 }
1959 /*
1960 * It's not contiguous, though.
1961 */
1962 if (bno + len < gtbno)
1963 haveright = 0;
1964 else {
1965 /*
1966 * If this failure happens the request to free this
1967 * space was invalid, it's (partly) already free.
1968 * Very bad.
1969 */
1970 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1971 error = -EFSCORRUPTED;
1972 goto error0;
1973 }
1974 }
1975 }
1976 /*
1977 * Now allocate and initialize a cursor for the by-size tree.
1978 */
1979 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1980 /*
1981 * Have both left and right contiguous neighbors.
1982 * Merge all three into a single free block.
1983 */
1984 if (haveleft && haveright) {
1985 /*
1986 * Delete the old by-size entry on the left.
1987 */
1988 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1989 goto error0;
1990 if (XFS_IS_CORRUPT(mp, i != 1)) {
1991 error = -EFSCORRUPTED;
1992 goto error0;
1993 }
1994 if ((error = xfs_btree_delete(cnt_cur, &i)))
1995 goto error0;
1996 if (XFS_IS_CORRUPT(mp, i != 1)) {
1997 error = -EFSCORRUPTED;
1998 goto error0;
1999 }
2000 /*
2001 * Delete the old by-size entry on the right.
2002 */
2003 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2004 goto error0;
2005 if (XFS_IS_CORRUPT(mp, i != 1)) {
2006 error = -EFSCORRUPTED;
2007 goto error0;
2008 }
2009 if ((error = xfs_btree_delete(cnt_cur, &i)))
2010 goto error0;
2011 if (XFS_IS_CORRUPT(mp, i != 1)) {
2012 error = -EFSCORRUPTED;
2013 goto error0;
2014 }
2015 /*
2016 * Delete the old by-block entry for the right block.
2017 */
2018 if ((error = xfs_btree_delete(bno_cur, &i)))
2019 goto error0;
2020 if (XFS_IS_CORRUPT(mp, i != 1)) {
2021 error = -EFSCORRUPTED;
2022 goto error0;
2023 }
2024 /*
2025 * Move the by-block cursor back to the left neighbor.
2026 */
2027 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2028 goto error0;
2029 if (XFS_IS_CORRUPT(mp, i != 1)) {
2030 error = -EFSCORRUPTED;
2031 goto error0;
2032 }
2033 #ifdef DEBUG
2034 /*
2035 * Check that this is the right record: delete didn't
2036 * mangle the cursor.
2037 */
2038 {
2039 xfs_agblock_t xxbno;
2040 xfs_extlen_t xxlen;
2041
2042 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2043 &i)))
2044 goto error0;
2045 if (XFS_IS_CORRUPT(mp,
2046 i != 1 ||
2047 xxbno != ltbno ||
2048 xxlen != ltlen)) {
2049 error = -EFSCORRUPTED;
2050 goto error0;
2051 }
2052 }
2053 #endif
2054 /*
2055 * Update remaining by-block entry to the new, joined block.
2056 */
2057 nbno = ltbno;
2058 nlen = len + ltlen + gtlen;
2059 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2060 goto error0;
2061 }
2062 /*
2063 * Have only a left contiguous neighbor.
2064 * Merge it together with the new freespace.
2065 */
2066 else if (haveleft) {
2067 /*
2068 * Delete the old by-size entry on the left.
2069 */
2070 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2071 goto error0;
2072 if (XFS_IS_CORRUPT(mp, i != 1)) {
2073 error = -EFSCORRUPTED;
2074 goto error0;
2075 }
2076 if ((error = xfs_btree_delete(cnt_cur, &i)))
2077 goto error0;
2078 if (XFS_IS_CORRUPT(mp, i != 1)) {
2079 error = -EFSCORRUPTED;
2080 goto error0;
2081 }
2082 /*
2083 * Back up the by-block cursor to the left neighbor, and
2084 * update its length.
2085 */
2086 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2087 goto error0;
2088 if (XFS_IS_CORRUPT(mp, i != 1)) {
2089 error = -EFSCORRUPTED;
2090 goto error0;
2091 }
2092 nbno = ltbno;
2093 nlen = len + ltlen;
2094 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2095 goto error0;
2096 }
2097 /*
2098 * Have only a right contiguous neighbor.
2099 * Merge it together with the new freespace.
2100 */
2101 else if (haveright) {
2102 /*
2103 * Delete the old by-size entry on the right.
2104 */
2105 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2106 goto error0;
2107 if (XFS_IS_CORRUPT(mp, i != 1)) {
2108 error = -EFSCORRUPTED;
2109 goto error0;
2110 }
2111 if ((error = xfs_btree_delete(cnt_cur, &i)))
2112 goto error0;
2113 if (XFS_IS_CORRUPT(mp, i != 1)) {
2114 error = -EFSCORRUPTED;
2115 goto error0;
2116 }
2117 /*
2118 * Update the starting block and length of the right
2119 * neighbor in the by-block tree.
2120 */
2121 nbno = bno;
2122 nlen = len + gtlen;
2123 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2124 goto error0;
2125 }
2126 /*
2127 * No contiguous neighbors.
2128 * Insert the new freespace into the by-block tree.
2129 */
2130 else {
2131 nbno = bno;
2132 nlen = len;
2133 if ((error = xfs_btree_insert(bno_cur, &i)))
2134 goto error0;
2135 if (XFS_IS_CORRUPT(mp, i != 1)) {
2136 error = -EFSCORRUPTED;
2137 goto error0;
2138 }
2139 }
2140 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2141 bno_cur = NULL;
2142 /*
2143 * In all cases we need to insert the new freespace in the by-size tree.
2144 */
2145 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2146 goto error0;
2147 if (XFS_IS_CORRUPT(mp, i != 0)) {
2148 error = -EFSCORRUPTED;
2149 goto error0;
2150 }
2151 if ((error = xfs_btree_insert(cnt_cur, &i)))
2152 goto error0;
2153 if (XFS_IS_CORRUPT(mp, i != 1)) {
2154 error = -EFSCORRUPTED;
2155 goto error0;
2156 }
2157 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2158 cnt_cur = NULL;
2159
2160 /*
2161 * Update the freespace totals in the ag and superblock.
2162 */
2163 error = xfs_alloc_update_counters(tp, agbp, len);
2164 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2165 if (error)
2166 goto error0;
2167
2168 XFS_STATS_INC(mp, xs_freex);
2169 XFS_STATS_ADD(mp, xs_freeb, len);
2170
2171 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2172
2173 return 0;
2174
2175 error0:
2176 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2177 if (bno_cur)
2178 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2179 if (cnt_cur)
2180 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2181 return error;
2182 }
2183
2184 /*
2185 * Visible (exported) allocation/free functions.
2186 * Some of these are used just by xfs_alloc_btree.c and this file.
2187 */
2188
2189 /*
2190 * Compute and fill in value of m_ag_maxlevels.
2191 */
2192 void
2193 xfs_alloc_compute_maxlevels(
2194 xfs_mount_t *mp) /* file system mount structure */
2195 {
2196 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2197 (mp->m_sb.sb_agblocks + 1) / 2);
2198 }
2199
2200 /*
2201 * Find the length of the longest extent in an AG. The 'need' parameter
2202 * specifies how much space we're going to need for the AGFL and the
2203 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2204 * other callers.
2205 */
2206 xfs_extlen_t
2207 xfs_alloc_longest_free_extent(
2208 struct xfs_perag *pag,
2209 xfs_extlen_t need,
2210 xfs_extlen_t reserved)
2211 {
2212 xfs_extlen_t delta = 0;
2213
2214 /*
2215 * If the AGFL needs a recharge, we'll have to subtract that from the
2216 * longest extent.
2217 */
2218 if (need > pag->pagf_flcount)
2219 delta = need - pag->pagf_flcount;
2220
2221 /*
2222 * If we cannot maintain others' reservations with space from the
2223 * not-longest freesp extents, we'll have to subtract /that/ from
2224 * the longest extent too.
2225 */
2226 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2227 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2228
2229 /*
2230 * If the longest extent is long enough to satisfy all the
2231 * reservations and AGFL rules in place, we can return this extent.
2232 */
2233 if (pag->pagf_longest > delta)
2234 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2235 pag->pagf_longest - delta);
2236
2237 /* Otherwise, let the caller try for 1 block if there's space. */
2238 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2239 }
2240
2241 /*
2242 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
2243 * return the largest possible minimum length.
2244 */
2245 unsigned int
2246 xfs_alloc_min_freelist(
2247 struct xfs_mount *mp,
2248 struct xfs_perag *pag)
2249 {
2250 /* AG btrees have at least 1 level. */
2251 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2252 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
2253 unsigned int min_free;
2254
2255 ASSERT(mp->m_ag_maxlevels > 0);
2256
2257 /* space needed by-bno freespace btree */
2258 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2259 mp->m_ag_maxlevels);
2260 /* space needed by-size freespace btree */
2261 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2262 mp->m_ag_maxlevels);
2263 /* space needed reverse mapping used space btree */
2264 if (xfs_sb_version_hasrmapbt(&mp->m_sb))
2265 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2266 mp->m_rmap_maxlevels);
2267
2268 return min_free;
2269 }
2270
2271 /*
2272 * Check if the operation we are fixing up the freelist for should go ahead or
2273 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2274 * is dependent on whether the size and shape of free space available will
2275 * permit the requested allocation to take place.
2276 */
2277 static bool
2278 xfs_alloc_space_available(
2279 struct xfs_alloc_arg *args,
2280 xfs_extlen_t min_free,
2281 int flags)
2282 {
2283 struct xfs_perag *pag = args->pag;
2284 xfs_extlen_t alloc_len, longest;
2285 xfs_extlen_t reservation; /* blocks that are still reserved */
2286 int available;
2287 xfs_extlen_t agflcount;
2288
2289 if (flags & XFS_ALLOC_FLAG_FREEING)
2290 return true;
2291
2292 reservation = xfs_ag_resv_needed(pag, args->resv);
2293
2294 /* do we have enough contiguous free space for the allocation? */
2295 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2296 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2297 if (longest < alloc_len)
2298 return false;
2299
2300 /*
2301 * Do we have enough free space remaining for the allocation? Don't
2302 * account extra agfl blocks because we are about to defer free them,
2303 * making them unavailable until the current transaction commits.
2304 */
2305 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2306 available = (int)(pag->pagf_freeblks + agflcount -
2307 reservation - min_free - args->minleft);
2308 if (available < (int)max(args->total, alloc_len))
2309 return false;
2310
2311 /*
2312 * Clamp maxlen to the amount of free space available for the actual
2313 * extent allocation.
2314 */
2315 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2316 args->maxlen = available;
2317 ASSERT(args->maxlen > 0);
2318 ASSERT(args->maxlen >= args->minlen);
2319 }
2320
2321 return true;
2322 }
2323
2324 int
2325 xfs_free_agfl_block(
2326 struct xfs_trans *tp,
2327 xfs_agnumber_t agno,
2328 xfs_agblock_t agbno,
2329 struct xfs_buf *agbp,
2330 struct xfs_owner_info *oinfo)
2331 {
2332 int error;
2333 struct xfs_buf *bp;
2334
2335 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2336 XFS_AG_RESV_AGFL);
2337 if (error)
2338 return error;
2339
2340 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2341 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2342 tp->t_mountp->m_bsize, 0, &bp);
2343 if (error)
2344 return error;
2345 xfs_trans_binval(tp, bp);
2346
2347 return 0;
2348 }
2349
2350 /*
2351 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2352 * is to detect an agfl header padding mismatch between current and early v5
2353 * kernels. This problem manifests as a 1-slot size difference between the
2354 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2355 * may also catch variants of agfl count corruption unrelated to padding. Either
2356 * way, we'll reset the agfl and warn the user.
2357 *
2358 * Return true if a reset is required before the agfl can be used, false
2359 * otherwise.
2360 */
2361 static bool
2362 xfs_agfl_needs_reset(
2363 struct xfs_mount *mp,
2364 struct xfs_agf *agf)
2365 {
2366 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2367 uint32_t l = be32_to_cpu(agf->agf_fllast);
2368 uint32_t c = be32_to_cpu(agf->agf_flcount);
2369 int agfl_size = xfs_agfl_size(mp);
2370 int active;
2371
2372 /* no agfl header on v4 supers */
2373 if (!xfs_sb_version_hascrc(&mp->m_sb))
2374 return false;
2375
2376 /*
2377 * The agf read verifier catches severe corruption of these fields.
2378 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2379 * the verifier allows it.
2380 */
2381 if (f >= agfl_size || l >= agfl_size)
2382 return true;
2383 if (c > agfl_size)
2384 return true;
2385
2386 /*
2387 * Check consistency between the on-disk count and the active range. An
2388 * agfl padding mismatch manifests as an inconsistent flcount.
2389 */
2390 if (c && l >= f)
2391 active = l - f + 1;
2392 else if (c)
2393 active = agfl_size - f + l + 1;
2394 else
2395 active = 0;
2396
2397 return active != c;
2398 }
2399
2400 /*
2401 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2402 * agfl content cannot be trusted. Warn the user that a repair is required to
2403 * recover leaked blocks.
2404 *
2405 * The purpose of this mechanism is to handle filesystems affected by the agfl
2406 * header padding mismatch problem. A reset keeps the filesystem online with a
2407 * relatively minor free space accounting inconsistency rather than suffer the
2408 * inevitable crash from use of an invalid agfl block.
2409 */
2410 static void
2411 xfs_agfl_reset(
2412 struct xfs_trans *tp,
2413 struct xfs_buf *agbp,
2414 struct xfs_perag *pag)
2415 {
2416 struct xfs_mount *mp = tp->t_mountp;
2417 struct xfs_agf *agf = agbp->b_addr;
2418
2419 ASSERT(pag->pagf_agflreset);
2420 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2421
2422 xfs_warn(mp,
2423 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2424 "Please unmount and run xfs_repair.",
2425 pag->pag_agno, pag->pagf_flcount);
2426
2427 agf->agf_flfirst = 0;
2428 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2429 agf->agf_flcount = 0;
2430 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2431 XFS_AGF_FLCOUNT);
2432
2433 pag->pagf_flcount = 0;
2434 pag->pagf_agflreset = false;
2435 }
2436
2437 /*
2438 * Defer an AGFL block free. This is effectively equivalent to
2439 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2440 *
2441 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2442 * allocation operations in a transaction. AGFL frees are prone to this problem
2443 * because for one they are always freed one at a time. Further, an immediate
2444 * AGFL block free can cause a btree join and require another block free before
2445 * the real allocation can proceed. Deferring the free disconnects freeing up
2446 * the AGFL slot from freeing the block.
2447 */
2448 STATIC void
2449 xfs_defer_agfl_block(
2450 struct xfs_trans *tp,
2451 xfs_agnumber_t agno,
2452 xfs_fsblock_t agbno,
2453 struct xfs_owner_info *oinfo)
2454 {
2455 struct xfs_mount *mp = tp->t_mountp;
2456 struct xfs_extent_free_item *new; /* new element */
2457
2458 ASSERT(xfs_bmap_free_item_zone != NULL);
2459 ASSERT(oinfo != NULL);
2460
2461 new = kmem_cache_alloc(xfs_bmap_free_item_zone,
2462 GFP_KERNEL | __GFP_NOFAIL);
2463 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2464 new->xefi_blockcount = 1;
2465 new->xefi_oinfo = *oinfo;
2466 new->xefi_skip_discard = false;
2467
2468 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2469
2470 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
2471 }
2472
2473 /*
2474 * Decide whether to use this allocation group for this allocation.
2475 * If so, fix up the btree freelist's size.
2476 */
2477 int /* error */
2478 xfs_alloc_fix_freelist(
2479 struct xfs_alloc_arg *args, /* allocation argument structure */
2480 int flags) /* XFS_ALLOC_FLAG_... */
2481 {
2482 struct xfs_mount *mp = args->mp;
2483 struct xfs_perag *pag = args->pag;
2484 struct xfs_trans *tp = args->tp;
2485 struct xfs_buf *agbp = NULL;
2486 struct xfs_buf *agflbp = NULL;
2487 struct xfs_alloc_arg targs; /* local allocation arguments */
2488 xfs_agblock_t bno; /* freelist block */
2489 xfs_extlen_t need; /* total blocks needed in freelist */
2490 int error = 0;
2491
2492 /* deferred ops (AGFL block frees) require permanent transactions */
2493 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2494
2495 if (!pag->pagf_init) {
2496 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2497 if (error) {
2498 /* Couldn't lock the AGF so skip this AG. */
2499 if (error == -EAGAIN)
2500 error = 0;
2501 goto out_no_agbp;
2502 }
2503 }
2504
2505 /*
2506 * If this is a metadata preferred pag and we are user data then try
2507 * somewhere else if we are not being asked to try harder at this
2508 * point
2509 */
2510 if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2511 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2512 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2513 goto out_agbp_relse;
2514 }
2515
2516 need = xfs_alloc_min_freelist(mp, pag);
2517 if (!xfs_alloc_space_available(args, need, flags |
2518 XFS_ALLOC_FLAG_CHECK))
2519 goto out_agbp_relse;
2520
2521 /*
2522 * Get the a.g. freespace buffer.
2523 * Can fail if we're not blocking on locks, and it's held.
2524 */
2525 if (!agbp) {
2526 error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2527 if (error) {
2528 /* Couldn't lock the AGF so skip this AG. */
2529 if (error == -EAGAIN)
2530 error = 0;
2531 goto out_no_agbp;
2532 }
2533 }
2534
2535 /* reset a padding mismatched agfl before final free space check */
2536 if (pag->pagf_agflreset)
2537 xfs_agfl_reset(tp, agbp, pag);
2538
2539 /* If there isn't enough total space or single-extent, reject it. */
2540 need = xfs_alloc_min_freelist(mp, pag);
2541 if (!xfs_alloc_space_available(args, need, flags))
2542 goto out_agbp_relse;
2543
2544 /*
2545 * Make the freelist shorter if it's too long.
2546 *
2547 * Note that from this point onwards, we will always release the agf and
2548 * agfl buffers on error. This handles the case where we error out and
2549 * the buffers are clean or may not have been joined to the transaction
2550 * and hence need to be released manually. If they have been joined to
2551 * the transaction, then xfs_trans_brelse() will handle them
2552 * appropriately based on the recursion count and dirty state of the
2553 * buffer.
2554 *
2555 * XXX (dgc): When we have lots of free space, does this buy us
2556 * anything other than extra overhead when we need to put more blocks
2557 * back on the free list? Maybe we should only do this when space is
2558 * getting low or the AGFL is more than half full?
2559 *
2560 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2561 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2562 * updating the rmapbt. Both flags are used in xfs_repair while we're
2563 * rebuilding the rmapbt, and neither are used by the kernel. They're
2564 * both required to ensure that rmaps are correctly recorded for the
2565 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2566 * repair/rmap.c in xfsprogs for details.
2567 */
2568 memset(&targs, 0, sizeof(targs));
2569 /* struct copy below */
2570 if (flags & XFS_ALLOC_FLAG_NORMAP)
2571 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2572 else
2573 targs.oinfo = XFS_RMAP_OINFO_AG;
2574 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2575 error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2576 if (error)
2577 goto out_agbp_relse;
2578
2579 /* defer agfl frees */
2580 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2581 }
2582
2583 targs.tp = tp;
2584 targs.mp = mp;
2585 targs.agbp = agbp;
2586 targs.agno = args->agno;
2587 targs.alignment = targs.minlen = targs.prod = 1;
2588 targs.type = XFS_ALLOCTYPE_THIS_AG;
2589 targs.pag = pag;
2590 error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2591 if (error)
2592 goto out_agbp_relse;
2593
2594 /* Make the freelist longer if it's too short. */
2595 while (pag->pagf_flcount < need) {
2596 targs.agbno = 0;
2597 targs.maxlen = need - pag->pagf_flcount;
2598 targs.resv = XFS_AG_RESV_AGFL;
2599
2600 /* Allocate as many blocks as possible at once. */
2601 error = xfs_alloc_ag_vextent(&targs);
2602 if (error)
2603 goto out_agflbp_relse;
2604
2605 /*
2606 * Stop if we run out. Won't happen if callers are obeying
2607 * the restrictions correctly. Can happen for free calls
2608 * on a completely full ag.
2609 */
2610 if (targs.agbno == NULLAGBLOCK) {
2611 if (flags & XFS_ALLOC_FLAG_FREEING)
2612 break;
2613 goto out_agflbp_relse;
2614 }
2615 /*
2616 * Put each allocated block on the list.
2617 */
2618 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2619 error = xfs_alloc_put_freelist(tp, agbp,
2620 agflbp, bno, 0);
2621 if (error)
2622 goto out_agflbp_relse;
2623 }
2624 }
2625 xfs_trans_brelse(tp, agflbp);
2626 args->agbp = agbp;
2627 return 0;
2628
2629 out_agflbp_relse:
2630 xfs_trans_brelse(tp, agflbp);
2631 out_agbp_relse:
2632 if (agbp)
2633 xfs_trans_brelse(tp, agbp);
2634 out_no_agbp:
2635 args->agbp = NULL;
2636 return error;
2637 }
2638
2639 /*
2640 * Get a block from the freelist.
2641 * Returns with the buffer for the block gotten.
2642 */
2643 int /* error */
2644 xfs_alloc_get_freelist(
2645 xfs_trans_t *tp, /* transaction pointer */
2646 xfs_buf_t *agbp, /* buffer containing the agf structure */
2647 xfs_agblock_t *bnop, /* block address retrieved from freelist */
2648 int btreeblk) /* destination is a AGF btree */
2649 {
2650 struct xfs_agf *agf = agbp->b_addr;
2651 xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */
2652 xfs_agblock_t bno; /* block number returned */
2653 __be32 *agfl_bno;
2654 int error;
2655 int logflags;
2656 xfs_mount_t *mp = tp->t_mountp;
2657 xfs_perag_t *pag; /* per allocation group data */
2658
2659 /*
2660 * Freelist is empty, give up.
2661 */
2662 if (!agf->agf_flcount) {
2663 *bnop = NULLAGBLOCK;
2664 return 0;
2665 }
2666 /*
2667 * Read the array of free blocks.
2668 */
2669 error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2670 &agflbp);
2671 if (error)
2672 return error;
2673
2674
2675 /*
2676 * Get the block number and update the data structures.
2677 */
2678 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2679 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2680 be32_add_cpu(&agf->agf_flfirst, 1);
2681 xfs_trans_brelse(tp, agflbp);
2682 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2683 agf->agf_flfirst = 0;
2684
2685 pag = agbp->b_pag;
2686 ASSERT(!pag->pagf_agflreset);
2687 be32_add_cpu(&agf->agf_flcount, -1);
2688 xfs_trans_agflist_delta(tp, -1);
2689 pag->pagf_flcount--;
2690
2691 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2692 if (btreeblk) {
2693 be32_add_cpu(&agf->agf_btreeblks, 1);
2694 pag->pagf_btreeblks++;
2695 logflags |= XFS_AGF_BTREEBLKS;
2696 }
2697
2698 xfs_alloc_log_agf(tp, agbp, logflags);
2699 *bnop = bno;
2700
2701 return 0;
2702 }
2703
2704 /*
2705 * Log the given fields from the agf structure.
2706 */
2707 void
2708 xfs_alloc_log_agf(
2709 xfs_trans_t *tp, /* transaction pointer */
2710 xfs_buf_t *bp, /* buffer for a.g. freelist header */
2711 int fields) /* mask of fields to be logged (XFS_AGF_...) */
2712 {
2713 int first; /* first byte offset */
2714 int last; /* last byte offset */
2715 static const short offsets[] = {
2716 offsetof(xfs_agf_t, agf_magicnum),
2717 offsetof(xfs_agf_t, agf_versionnum),
2718 offsetof(xfs_agf_t, agf_seqno),
2719 offsetof(xfs_agf_t, agf_length),
2720 offsetof(xfs_agf_t, agf_roots[0]),
2721 offsetof(xfs_agf_t, agf_levels[0]),
2722 offsetof(xfs_agf_t, agf_flfirst),
2723 offsetof(xfs_agf_t, agf_fllast),
2724 offsetof(xfs_agf_t, agf_flcount),
2725 offsetof(xfs_agf_t, agf_freeblks),
2726 offsetof(xfs_agf_t, agf_longest),
2727 offsetof(xfs_agf_t, agf_btreeblks),
2728 offsetof(xfs_agf_t, agf_uuid),
2729 offsetof(xfs_agf_t, agf_rmap_blocks),
2730 offsetof(xfs_agf_t, agf_refcount_blocks),
2731 offsetof(xfs_agf_t, agf_refcount_root),
2732 offsetof(xfs_agf_t, agf_refcount_level),
2733 /* needed so that we don't log the whole rest of the structure: */
2734 offsetof(xfs_agf_t, agf_spare64),
2735 sizeof(xfs_agf_t)
2736 };
2737
2738 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2739
2740 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2741
2742 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2743 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2744 }
2745
2746 /*
2747 * Interface for inode allocation to force the pag data to be initialized.
2748 */
2749 int /* error */
2750 xfs_alloc_pagf_init(
2751 xfs_mount_t *mp, /* file system mount structure */
2752 xfs_trans_t *tp, /* transaction pointer */
2753 xfs_agnumber_t agno, /* allocation group number */
2754 int flags) /* XFS_ALLOC_FLAGS_... */
2755 {
2756 xfs_buf_t *bp;
2757 int error;
2758
2759 error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
2760 if (!error)
2761 xfs_trans_brelse(tp, bp);
2762 return error;
2763 }
2764
2765 /*
2766 * Put the block on the freelist for the allocation group.
2767 */
2768 int /* error */
2769 xfs_alloc_put_freelist(
2770 xfs_trans_t *tp, /* transaction pointer */
2771 xfs_buf_t *agbp, /* buffer for a.g. freelist header */
2772 xfs_buf_t *agflbp,/* buffer for a.g. free block array */
2773 xfs_agblock_t bno, /* block being freed */
2774 int btreeblk) /* block came from a AGF btree */
2775 {
2776 struct xfs_mount *mp = tp->t_mountp;
2777 struct xfs_agf *agf = agbp->b_addr;
2778 __be32 *blockp;/* pointer to array entry */
2779 int error;
2780 int logflags;
2781 xfs_perag_t *pag; /* per allocation group data */
2782 __be32 *agfl_bno;
2783 int startoff;
2784
2785 if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2786 be32_to_cpu(agf->agf_seqno), &agflbp)))
2787 return error;
2788 be32_add_cpu(&agf->agf_fllast, 1);
2789 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2790 agf->agf_fllast = 0;
2791
2792 pag = agbp->b_pag;
2793 ASSERT(!pag->pagf_agflreset);
2794 be32_add_cpu(&agf->agf_flcount, 1);
2795 xfs_trans_agflist_delta(tp, 1);
2796 pag->pagf_flcount++;
2797
2798 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2799 if (btreeblk) {
2800 be32_add_cpu(&agf->agf_btreeblks, -1);
2801 pag->pagf_btreeblks--;
2802 logflags |= XFS_AGF_BTREEBLKS;
2803 }
2804
2805 xfs_alloc_log_agf(tp, agbp, logflags);
2806
2807 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2808
2809 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2810 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2811 *blockp = cpu_to_be32(bno);
2812 startoff = (char *)blockp - (char *)agflbp->b_addr;
2813
2814 xfs_alloc_log_agf(tp, agbp, logflags);
2815
2816 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2817 xfs_trans_log_buf(tp, agflbp, startoff,
2818 startoff + sizeof(xfs_agblock_t) - 1);
2819 return 0;
2820 }
2821
2822 static xfs_failaddr_t
2823 xfs_agf_verify(
2824 struct xfs_buf *bp)
2825 {
2826 struct xfs_mount *mp = bp->b_mount;
2827 struct xfs_agf *agf = bp->b_addr;
2828
2829 if (xfs_sb_version_hascrc(&mp->m_sb)) {
2830 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2831 return __this_address;
2832 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2833 return __this_address;
2834 }
2835
2836 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2837 return __this_address;
2838
2839 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2840 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2841 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2842 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2843 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2844 return __this_address;
2845
2846 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2847 return __this_address;
2848
2849 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2850 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2851 return __this_address;
2852
2853 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2854 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2855 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
2856 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
2857 return __this_address;
2858
2859 if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2860 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2861 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
2862 return __this_address;
2863
2864 if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2865 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2866 return __this_address;
2867
2868 /*
2869 * during growfs operations, the perag is not fully initialised,
2870 * so we can't use it for any useful checking. growfs ensures we can't
2871 * use it by using uncached buffers that don't have the perag attached
2872 * so we can detect and avoid this problem.
2873 */
2874 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2875 return __this_address;
2876
2877 if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2878 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2879 return __this_address;
2880
2881 if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2882 be32_to_cpu(agf->agf_refcount_blocks) >
2883 be32_to_cpu(agf->agf_length))
2884 return __this_address;
2885
2886 if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2887 (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2888 be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
2889 return __this_address;
2890
2891 return NULL;
2892
2893 }
2894
2895 static void
2896 xfs_agf_read_verify(
2897 struct xfs_buf *bp)
2898 {
2899 struct xfs_mount *mp = bp->b_mount;
2900 xfs_failaddr_t fa;
2901
2902 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2903 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2904 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2905 else {
2906 fa = xfs_agf_verify(bp);
2907 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2908 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2909 }
2910 }
2911
2912 static void
2913 xfs_agf_write_verify(
2914 struct xfs_buf *bp)
2915 {
2916 struct xfs_mount *mp = bp->b_mount;
2917 struct xfs_buf_log_item *bip = bp->b_log_item;
2918 struct xfs_agf *agf = bp->b_addr;
2919 xfs_failaddr_t fa;
2920
2921 fa = xfs_agf_verify(bp);
2922 if (fa) {
2923 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2924 return;
2925 }
2926
2927 if (!xfs_sb_version_hascrc(&mp->m_sb))
2928 return;
2929
2930 if (bip)
2931 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2932
2933 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2934 }
2935
2936 const struct xfs_buf_ops xfs_agf_buf_ops = {
2937 .name = "xfs_agf",
2938 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2939 .verify_read = xfs_agf_read_verify,
2940 .verify_write = xfs_agf_write_verify,
2941 .verify_struct = xfs_agf_verify,
2942 };
2943
2944 /*
2945 * Read in the allocation group header (free/alloc section).
2946 */
2947 int /* error */
2948 xfs_read_agf(
2949 struct xfs_mount *mp, /* mount point structure */
2950 struct xfs_trans *tp, /* transaction pointer */
2951 xfs_agnumber_t agno, /* allocation group number */
2952 int flags, /* XFS_BUF_ */
2953 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2954 {
2955 int error;
2956
2957 trace_xfs_read_agf(mp, agno);
2958
2959 ASSERT(agno != NULLAGNUMBER);
2960 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2961 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2962 XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2963 if (error)
2964 return error;
2965
2966 ASSERT(!(*bpp)->b_error);
2967 xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2968 return 0;
2969 }
2970
2971 /*
2972 * Read in the allocation group header (free/alloc section).
2973 */
2974 int /* error */
2975 xfs_alloc_read_agf(
2976 struct xfs_mount *mp, /* mount point structure */
2977 struct xfs_trans *tp, /* transaction pointer */
2978 xfs_agnumber_t agno, /* allocation group number */
2979 int flags, /* XFS_ALLOC_FLAG_... */
2980 struct xfs_buf **bpp) /* buffer for the ag freelist header */
2981 {
2982 struct xfs_agf *agf; /* ag freelist header */
2983 struct xfs_perag *pag; /* per allocation group data */
2984 int error;
2985
2986 trace_xfs_alloc_read_agf(mp, agno);
2987
2988 /* We don't support trylock when freeing. */
2989 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
2990 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
2991 ASSERT(agno != NULLAGNUMBER);
2992 error = xfs_read_agf(mp, tp, agno,
2993 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
2994 bpp);
2995 if (error)
2996 return error;
2997 ASSERT(!(*bpp)->b_error);
2998
2999 agf = (*bpp)->b_addr;
3000 pag = (*bpp)->b_pag;
3001 if (!pag->pagf_init) {
3002 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3003 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3004 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3005 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3006 pag->pagf_levels[XFS_BTNUM_BNOi] =
3007 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3008 pag->pagf_levels[XFS_BTNUM_CNTi] =
3009 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3010 pag->pagf_levels[XFS_BTNUM_RMAPi] =
3011 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3012 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3013 pag->pagf_init = 1;
3014 pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
3015 }
3016 #ifdef DEBUG
3017 else if (!XFS_FORCED_SHUTDOWN(mp)) {
3018 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3019 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3020 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3021 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3022 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3023 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3024 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3025 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3026 }
3027 #endif
3028 return 0;
3029 }
3030
3031 /*
3032 * Allocate an extent (variable-size).
3033 * Depending on the allocation type, we either look in a single allocation
3034 * group or loop over the allocation groups to find the result.
3035 */
3036 int /* error */
3037 xfs_alloc_vextent(
3038 struct xfs_alloc_arg *args) /* allocation argument structure */
3039 {
3040 xfs_agblock_t agsize; /* allocation group size */
3041 int error;
3042 int flags; /* XFS_ALLOC_FLAG_... locking flags */
3043 struct xfs_mount *mp; /* mount structure pointer */
3044 xfs_agnumber_t sagno; /* starting allocation group number */
3045 xfs_alloctype_t type; /* input allocation type */
3046 int bump_rotor = 0;
3047 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3048
3049 mp = args->mp;
3050 type = args->otype = args->type;
3051 args->agbno = NULLAGBLOCK;
3052 /*
3053 * Just fix this up, for the case where the last a.g. is shorter
3054 * (or there's only one a.g.) and the caller couldn't easily figure
3055 * that out (xfs_bmap_alloc).
3056 */
3057 agsize = mp->m_sb.sb_agblocks;
3058 if (args->maxlen > agsize)
3059 args->maxlen = agsize;
3060 if (args->alignment == 0)
3061 args->alignment = 1;
3062 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3063 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3064 ASSERT(args->minlen <= args->maxlen);
3065 ASSERT(args->minlen <= agsize);
3066 ASSERT(args->mod < args->prod);
3067 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3068 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3069 args->minlen > args->maxlen || args->minlen > agsize ||
3070 args->mod >= args->prod) {
3071 args->fsbno = NULLFSBLOCK;
3072 trace_xfs_alloc_vextent_badargs(args);
3073 return 0;
3074 }
3075
3076 switch (type) {
3077 case XFS_ALLOCTYPE_THIS_AG:
3078 case XFS_ALLOCTYPE_NEAR_BNO:
3079 case XFS_ALLOCTYPE_THIS_BNO:
3080 /*
3081 * These three force us into a single a.g.
3082 */
3083 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3084 args->pag = xfs_perag_get(mp, args->agno);
3085 error = xfs_alloc_fix_freelist(args, 0);
3086 if (error) {
3087 trace_xfs_alloc_vextent_nofix(args);
3088 goto error0;
3089 }
3090 if (!args->agbp) {
3091 trace_xfs_alloc_vextent_noagbp(args);
3092 break;
3093 }
3094 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3095 if ((error = xfs_alloc_ag_vextent(args)))
3096 goto error0;
3097 break;
3098 case XFS_ALLOCTYPE_START_BNO:
3099 /*
3100 * Try near allocation first, then anywhere-in-ag after
3101 * the first a.g. fails.
3102 */
3103 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3104 (mp->m_flags & XFS_MOUNT_32BITINODES)) {
3105 args->fsbno = XFS_AGB_TO_FSB(mp,
3106 ((mp->m_agfrotor / rotorstep) %
3107 mp->m_sb.sb_agcount), 0);
3108 bump_rotor = 1;
3109 }
3110 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3111 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3112 /* FALLTHROUGH */
3113 case XFS_ALLOCTYPE_FIRST_AG:
3114 /*
3115 * Rotate through the allocation groups looking for a winner.
3116 */
3117 if (type == XFS_ALLOCTYPE_FIRST_AG) {
3118 /*
3119 * Start with allocation group given by bno.
3120 */
3121 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3122 args->type = XFS_ALLOCTYPE_THIS_AG;
3123 sagno = 0;
3124 flags = 0;
3125 } else {
3126 /*
3127 * Start with the given allocation group.
3128 */
3129 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3130 flags = XFS_ALLOC_FLAG_TRYLOCK;
3131 }
3132 /*
3133 * Loop over allocation groups twice; first time with
3134 * trylock set, second time without.
3135 */
3136 for (;;) {
3137 args->pag = xfs_perag_get(mp, args->agno);
3138 error = xfs_alloc_fix_freelist(args, flags);
3139 if (error) {
3140 trace_xfs_alloc_vextent_nofix(args);
3141 goto error0;
3142 }
3143 /*
3144 * If we get a buffer back then the allocation will fly.
3145 */
3146 if (args->agbp) {
3147 if ((error = xfs_alloc_ag_vextent(args)))
3148 goto error0;
3149 break;
3150 }
3151
3152 trace_xfs_alloc_vextent_loopfailed(args);
3153
3154 /*
3155 * Didn't work, figure out the next iteration.
3156 */
3157 if (args->agno == sagno &&
3158 type == XFS_ALLOCTYPE_START_BNO)
3159 args->type = XFS_ALLOCTYPE_THIS_AG;
3160 /*
3161 * For the first allocation, we can try any AG to get
3162 * space. However, if we already have allocated a
3163 * block, we don't want to try AGs whose number is below
3164 * sagno. Otherwise, we may end up with out-of-order
3165 * locking of AGF, which might cause deadlock.
3166 */
3167 if (++(args->agno) == mp->m_sb.sb_agcount) {
3168 if (args->tp->t_firstblock != NULLFSBLOCK)
3169 args->agno = sagno;
3170 else
3171 args->agno = 0;
3172 }
3173 /*
3174 * Reached the starting a.g., must either be done
3175 * or switch to non-trylock mode.
3176 */
3177 if (args->agno == sagno) {
3178 if (flags == 0) {
3179 args->agbno = NULLAGBLOCK;
3180 trace_xfs_alloc_vextent_allfailed(args);
3181 break;
3182 }
3183
3184 flags = 0;
3185 if (type == XFS_ALLOCTYPE_START_BNO) {
3186 args->agbno = XFS_FSB_TO_AGBNO(mp,
3187 args->fsbno);
3188 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3189 }
3190 }
3191 xfs_perag_put(args->pag);
3192 }
3193 if (bump_rotor) {
3194 if (args->agno == sagno)
3195 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3196 (mp->m_sb.sb_agcount * rotorstep);
3197 else
3198 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3199 (mp->m_sb.sb_agcount * rotorstep);
3200 }
3201 break;
3202 default:
3203 ASSERT(0);
3204 /* NOTREACHED */
3205 }
3206 if (args->agbno == NULLAGBLOCK)
3207 args->fsbno = NULLFSBLOCK;
3208 else {
3209 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3210 #ifdef DEBUG
3211 ASSERT(args->len >= args->minlen);
3212 ASSERT(args->len <= args->maxlen);
3213 ASSERT(args->agbno % args->alignment == 0);
3214 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3215 args->len);
3216 #endif
3217
3218 }
3219 xfs_perag_put(args->pag);
3220 return 0;
3221 error0:
3222 xfs_perag_put(args->pag);
3223 return error;
3224 }
3225
3226 /* Ensure that the freelist is at full capacity. */
3227 int
3228 xfs_free_extent_fix_freelist(
3229 struct xfs_trans *tp,
3230 xfs_agnumber_t agno,
3231 struct xfs_buf **agbp)
3232 {
3233 struct xfs_alloc_arg args;
3234 int error;
3235
3236 memset(&args, 0, sizeof(struct xfs_alloc_arg));
3237 args.tp = tp;
3238 args.mp = tp->t_mountp;
3239 args.agno = agno;
3240
3241 /*
3242 * validate that the block number is legal - the enables us to detect
3243 * and handle a silent filesystem corruption rather than crashing.
3244 */
3245 if (args.agno >= args.mp->m_sb.sb_agcount)
3246 return -EFSCORRUPTED;
3247
3248 args.pag = xfs_perag_get(args.mp, args.agno);
3249 ASSERT(args.pag);
3250
3251 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3252 if (error)
3253 goto out;
3254
3255 *agbp = args.agbp;
3256 out:
3257 xfs_perag_put(args.pag);
3258 return error;
3259 }
3260
3261 /*
3262 * Free an extent.
3263 * Just break up the extent address and hand off to xfs_free_ag_extent
3264 * after fixing up the freelist.
3265 */
3266 int
3267 __xfs_free_extent(
3268 struct xfs_trans *tp,
3269 xfs_fsblock_t bno,
3270 xfs_extlen_t len,
3271 const struct xfs_owner_info *oinfo,
3272 enum xfs_ag_resv_type type,
3273 bool skip_discard)
3274 {
3275 struct xfs_mount *mp = tp->t_mountp;
3276 struct xfs_buf *agbp;
3277 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
3278 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
3279 struct xfs_agf *agf;
3280 int error;
3281 unsigned int busy_flags = 0;
3282
3283 ASSERT(len != 0);
3284 ASSERT(type != XFS_AG_RESV_AGFL);
3285
3286 if (XFS_TEST_ERROR(false, mp,
3287 XFS_ERRTAG_FREE_EXTENT))
3288 return -EIO;
3289
3290 error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
3291 if (error)
3292 return error;
3293 agf = agbp->b_addr;
3294
3295 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3296 error = -EFSCORRUPTED;
3297 goto err;
3298 }
3299
3300 /* validate the extent size is legal now we have the agf locked */
3301 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3302 error = -EFSCORRUPTED;
3303 goto err;
3304 }
3305
3306 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3307 if (error)
3308 goto err;
3309
3310 if (skip_discard)
3311 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3312 xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
3313 return 0;
3314
3315 err:
3316 xfs_trans_brelse(tp, agbp);
3317 return error;
3318 }
3319
3320 struct xfs_alloc_query_range_info {
3321 xfs_alloc_query_range_fn fn;
3322 void *priv;
3323 };
3324
3325 /* Format btree record and pass to our callback. */
3326 STATIC int
3327 xfs_alloc_query_range_helper(
3328 struct xfs_btree_cur *cur,
3329 union xfs_btree_rec *rec,
3330 void *priv)
3331 {
3332 struct xfs_alloc_query_range_info *query = priv;
3333 struct xfs_alloc_rec_incore irec;
3334
3335 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3336 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3337 return query->fn(cur, &irec, query->priv);
3338 }
3339
3340 /* Find all free space within a given range of blocks. */
3341 int
3342 xfs_alloc_query_range(
3343 struct xfs_btree_cur *cur,
3344 struct xfs_alloc_rec_incore *low_rec,
3345 struct xfs_alloc_rec_incore *high_rec,
3346 xfs_alloc_query_range_fn fn,
3347 void *priv)
3348 {
3349 union xfs_btree_irec low_brec;
3350 union xfs_btree_irec high_brec;
3351 struct xfs_alloc_query_range_info query;
3352
3353 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3354 low_brec.a = *low_rec;
3355 high_brec.a = *high_rec;
3356 query.priv = priv;
3357 query.fn = fn;
3358 return xfs_btree_query_range(cur, &low_brec, &high_brec,
3359 xfs_alloc_query_range_helper, &query);
3360 }
3361
3362 /* Find all free space records. */
3363 int
3364 xfs_alloc_query_all(
3365 struct xfs_btree_cur *cur,
3366 xfs_alloc_query_range_fn fn,
3367 void *priv)
3368 {
3369 struct xfs_alloc_query_range_info query;
3370
3371 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3372 query.priv = priv;
3373 query.fn = fn;
3374 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3375 }
3376
3377 /* Is there a record covering a given extent? */
3378 int
3379 xfs_alloc_has_record(
3380 struct xfs_btree_cur *cur,
3381 xfs_agblock_t bno,
3382 xfs_extlen_t len,
3383 bool *exists)
3384 {
3385 union xfs_btree_irec low;
3386 union xfs_btree_irec high;
3387
3388 memset(&low, 0, sizeof(low));
3389 low.a.ar_startblock = bno;
3390 memset(&high, 0xFF, sizeof(high));
3391 high.a.ar_startblock = bno + len - 1;
3392
3393 return xfs_btree_has_record(cur, &low, &high, exists);
3394 }
3395
3396 /*
3397 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
3398 * error code or XFS_ITER_*.
3399 */
3400 int
3401 xfs_agfl_walk(
3402 struct xfs_mount *mp,
3403 struct xfs_agf *agf,
3404 struct xfs_buf *agflbp,
3405 xfs_agfl_walk_fn walk_fn,
3406 void *priv)
3407 {
3408 __be32 *agfl_bno;
3409 unsigned int i;
3410 int error;
3411
3412 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3413 i = be32_to_cpu(agf->agf_flfirst);
3414
3415 /* Nothing to walk in an empty AGFL. */
3416 if (agf->agf_flcount == cpu_to_be32(0))
3417 return 0;
3418
3419 /* Otherwise, walk from first to last, wrapping as needed. */
3420 for (;;) {
3421 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3422 if (error)
3423 return error;
3424 if (i == be32_to_cpu(agf->agf_fllast))
3425 break;
3426 if (++i == xfs_agfl_size(mp))
3427 i = 0;
3428 }
3429
3430 return 0;
3431 }