// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "libxfs_priv.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
#include "xfs_health.h"
/*
 * Lookup a record by ino in the btree given by cur.
 */
int					/* error */
xfs_inobt_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agino_t		ino,	/* starting inode of chunk */
	xfs_lookup_t		dir,	/* <=, >=, == */
	int			*stat)	/* success/failure */
{
	cur->bc_rec.i.ir_startino = ino;
	cur->bc_rec.i.ir_holemask = 0;
	cur->bc_rec.i.ir_count = 0;
	cur->bc_rec.i.ir_freecount = 0;
	cur->bc_rec.i.ir_free = 0;
	return xfs_btree_lookup(cur, dir, stat);
}
/*
 * Update the record referred to by cur to the value given.
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_inobt_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_inobt_rec_incore_t	*irec)	/* btree record */
{
	union xfs_btree_rec	rec;

	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
	if (xfs_has_sparseinodes(cur->bc_mp)) {
		rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
		rec.inobt.ir_u.sp.ir_count = irec->ir_count;
		rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
	} else {
		/* ir_holemask/ir_count not supported on-disk */
		rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
	}
	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
	return xfs_btree_update(cur, &rec);
}
/* Convert on-disk btree record to incore inobt record. */
void
xfs_inobt_btrec_to_irec(
	struct xfs_mount		*mp,
	const union xfs_btree_rec	*rec,
	struct xfs_inobt_rec_incore	*irec)
{
	irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
	if (xfs_has_sparseinodes(mp)) {
		irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
		irec->ir_count = rec->inobt.ir_u.sp.ir_count;
		irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
	} else {
		/*
		 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
		 * values for full inode chunks.
		 */
		irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
		irec->ir_count = XFS_INODES_PER_CHUNK;
		irec->ir_freecount =
				be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
	}
	irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
/* Compute the freecount of an incore inode record. */
uint8_t
xfs_inobt_rec_freecount(
	const struct xfs_inobt_rec_incore	*irec)
{
	uint64_t		realfree = irec->ir_free;

	if (xfs_inobt_issparse(irec->ir_holemask))
		realfree &= xfs_inobt_irec_to_allocmask(irec);
	return hweight64(realfree);
}
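/*
 * Illustrative worked example (not part of the original source; the values
 * below are assumed to show the arithmetic): consider a sparse record with
 * ir_holemask = 0xff00, i.e. holemask bits 8-15 set, so inodes 32-63 of the
 * 64-inode chunk are holes.  xfs_inobt_irec_to_allocmask() then returns
 * 0x00000000ffffffff.  With ir_free = XFS_INOBT_ALL_FREE (all ones, hole
 * bits included), the masked value is 0x00000000ffffffff and hweight64()
 * reports a freecount of 32 - only physically allocated inodes are counted.
 */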
/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
	struct xfs_perag			*pag,
	const struct xfs_inobt_rec_incore	*irec)
{
	/* Record has to be properly aligned within the AG. */
	if (!xfs_verify_agino(pag, irec->ir_startino))
		return __this_address;
	if (!xfs_verify_agino(pag,
				irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
		return __this_address;
	if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
	    irec->ir_count > XFS_INODES_PER_CHUNK)
		return __this_address;
	if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
		return __this_address;

	if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
		return __this_address;

	return NULL;
}
STATIC int
xfs_inobt_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_inobt_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%sbt record corruption in AG %d detected at %pS!",
		cur->bc_ops->name, cur->bc_group->xg_gno, fa);
	xfs_warn(mp,
		"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
		irec->ir_startino, irec->ir_count, irec->ir_freecount,
		irec->ir_free, irec->ir_holemask);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}
/*
 * Get the data from the pointed-to record.
 */
int
xfs_inobt_get_rec(
	struct xfs_btree_cur		*cur,
	struct xfs_inobt_rec_incore	*irec,
	int				*stat)
{
	struct xfs_mount		*mp = cur->bc_mp;
	union xfs_btree_rec		*rec;
	xfs_failaddr_t			fa;
	int				error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || *stat == 0)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, irec);

	return 0;
}
/*
 * Insert a single inobt record. Cursor must already point to desired location.
 */
int
xfs_inobt_insert_rec(
	struct xfs_btree_cur	*cur,
	uint16_t		holemask,
	uint8_t			count,
	int32_t			freecount,
	xfs_inofree_t		free,
	int			*stat)
{
	cur->bc_rec.i.ir_holemask = holemask;
	cur->bc_rec.i.ir_count = count;
	cur->bc_rec.i.ir_freecount = freecount;
	cur->bc_rec.i.ir_free = free;
	return xfs_btree_insert(cur, stat);
}
/*
 * Insert records describing a newly allocated inode chunk into the inobt.
 */
STATIC int
xfs_inobt_insert(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agino_t		newino,
	xfs_agino_t		newlen,
	bool			is_finobt)
{
	struct xfs_btree_cur	*cur;
	xfs_agino_t		thisino;
	int			i;
	int			error;

	if (is_finobt)
		cur = xfs_finobt_init_cursor(pag, tp, agbp);
	else
		cur = xfs_inobt_init_cursor(pag, tp, agbp);

	for (thisino = newino;
	     thisino < newino + newlen;
	     thisino += XFS_INODES_PER_CHUNK) {
		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 0);

		error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
					     XFS_INODES_PER_CHUNK,
					     XFS_INODES_PER_CHUNK,
					     XFS_INOBT_ALL_FREE, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 1);
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);

	return 0;
}
#ifdef DEBUG
/*
 * Verify that the number of free inodes in the AGI is correct.
 */
STATIC int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_nlevels == 1) {
		xfs_inobt_rec_incore_t rec;
		int		freecount = 0;
		int		error;
		int		i;

		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
		if (error)
			return error;

		do {
			error = xfs_inobt_get_rec(cur, &rec, &i);
			if (error)
				return error;

			if (i) {
				freecount += rec.ir_freecount;
				error = xfs_btree_increment(cur, 0, &i);
				if (error)
					return error;
			}
		} while (i == 1);

		if (!xfs_is_shutdown(cur->bc_mp)) {
			ASSERT(freecount ==
				to_perag(cur->bc_group)->pagi_freecount);
		}
	}
	return 0;
}
#else
#define xfs_check_agi_freecount(cur)	0
#endif
/*
 * Initialise a new set of inodes. When called without a transaction context
 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 * than logging them (which in a transaction context puts them into the AIL
 * for writeback rather than the xfsbufd queue).
 */
int
xfs_ialloc_inode_init(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct list_head	*buffer_list,
	int			icount,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_agblock_t		length,
	unsigned int		gen)
{
	struct xfs_buf		*fbuf;
	struct xfs_dinode	*free;
	int			nbufs;
	int			version;
	int			i, j;
	xfs_daddr_t		d;
	xfs_ino_t		ino = 0;
	int			error;

	/*
	 * Loop over the new block(s), filling in the inodes. For small block
	 * sizes, manipulate the inodes in buffers which are multiples of the
	 * block size.
	 */
	nbufs = length / M_IGEO(mp)->blocks_per_cluster;

	/*
	 * Figure out what version number to use in the inodes we create. If
	 * the superblock version has caught up to the one that supports the new
	 * inode format, then use the new inode version. Otherwise use the old
	 * version so that old kernels will continue to be able to use the file
	 * system.
	 *
	 * For v3 inodes, we also need to write the inode number into the inode,
	 * so calculate the first inode number of the chunk here as
	 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
	 * across multiple filesystem blocks (such as a cluster) and so cannot
	 * be used in the cluster buffer loop below.
	 *
	 * Further, because we are writing the inode directly into the buffer
	 * and calculating a CRC on the entire inode, we have to log the entire
	 * inode so that the entire range the CRC covers is present in the log.
	 * That means for v3 inodes we log the entire buffer rather than just
	 * the inode cores.
	 */
	if (xfs_has_v3inodes(mp)) {
		version = 3;
		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));

		/*
		 * log the initialisation that is about to take place as a
		 * logical operation. This means the transaction does not
		 * need to log the physical changes to the inode buffers as log
		 * recovery will know what initialisation is actually needed.
		 * Hence we only need to log the buffers as "ordered" buffers so
		 * they track in the AIL as if they were physically logged.
		 */
		if (tp)
			xfs_icreate_log(tp, agno, agbno, icount,
					mp->m_sb.sb_inodesize, length, gen);
	} else
		version = 2;

	for (j = 0; j < nbufs; j++) {
		d = XFS_AGB_TO_DADDR(mp, agno, agbno +
				(j * M_IGEO(mp)->blocks_per_cluster));
		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
				mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
				0, &fbuf);
		if (error)
			return error;

		/* Initialize the inode buffers and log them appropriately. */
		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
		for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;

			free = xfs_make_iptr(mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);

			if (version == 3) {
				free->di_ino = cpu_to_be64(ino);
				ino++;
				uuid_copy(&free->di_uuid,
					  &mp->m_sb.sb_meta_uuid);
				xfs_dinode_calc_crc(mp, free);
			} else if (tp) {
				/* just log the inode core */
				xfs_trans_log_buf(tp, fbuf, ioffset,
					  ioffset + XFS_DINODE_SIZE(mp) - 1);
			}
		}

		if (tp) {
			/*
			 * Mark the buffer as an inode allocation buffer so it
			 * sticks in AIL at the point of this allocation
			 * transaction. This ensures they are on disk before
			 * the tail of the log can be moved past this
			 * transaction (i.e. by preventing relogging from moving
			 * it forward in the log).
			 */
			xfs_trans_inode_alloc_buf(tp, fbuf);
			if (version == 3) {
				/*
				 * Mark the buffer as ordered so that they are
				 * not physically logged in the transaction but
				 * still tracked in the AIL as part of the
				 * transaction and pin the log appropriately.
				 */
				xfs_trans_ordered_buf(tp, fbuf);
			}
		} else {
			fbuf->b_flags |= XBF_DONE;
			xfs_buf_delwri_queue(fbuf, buffer_list);
			xfs_buf_relse(fbuf);
		}
	}
	return 0;
}
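/*
 * Illustrative geometry (assumed typical values, not taken from the original
 * source): with 4096-byte blocks, 512-byte inodes and a 16 KiB inode cluster,
 * blocks_per_cluster = 4 and inodes_per_cluster = 32.  A full 64-inode chunk
 * spans length = 8 blocks, so the loop above initialises nbufs = 8 / 4 = 2
 * cluster buffers of 32 inodes each.
 */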
/*
 * Align startino and allocmask for a recently allocated sparse chunk such that
 * they are fit for insertion (or merge) into the on-disk inode btrees.
 *
 * When enabled, sparse inode support increases the inode alignment from cluster
 * size to inode chunk size. This means that the minimum range between two
 * non-adjacent inode records in the inobt is large enough for a full inode
 * record. This allows for cluster sized, cluster aligned block allocation
 * without need to worry about whether the resulting inode record overlaps with
 * another record in the tree. Without this basic rule, we would have to deal
 * with the consequences of overlap by potentially undoing recent allocations in
 * the inode allocation codepath.
 *
 * Because of this alignment rule (which is enforced on mount), there are two
 * inobt possibilities for newly allocated sparse chunks. One is that the
 * aligned inode record for the chunk covers a range of inodes not already
 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 * other is that a record already exists at the aligned startino that considers
 * the newly allocated range as sparse. In the latter case, record content is
 * merged in hope that sparse inode chunks fill to full chunks over time.
 */
STATIC void
xfs_align_sparse_ino(
	struct xfs_mount		*mp,
	xfs_agino_t			*startino,
	uint16_t			*allocmask)
{
	xfs_agblock_t			agbno;
	xfs_agblock_t			mod;
	int				offset;

	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
	mod = agbno % mp->m_sb.sb_inoalignmt;
	if (!mod)
		return;

	/* calculate the inode offset and align startino */
	offset = XFS_AGB_TO_AGINO(mp, mod);
	*startino -= offset;

	/*
	 * Since startino has been aligned down, left shift allocmask such that
	 * it continues to represent the same physical inodes relative to the
	 * new startino.
	 */
	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
}
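/*
 * Worked example (illustrative only, with assumed geometry): on a filesystem
 * with 8 inodes per block and sb_inoalignmt = 8 blocks (one full chunk), a
 * 4-block sparse allocation landing at an agbno with agbno % 8 == 4 gives
 * mod = 4 and offset = XFS_AGB_TO_AGINO(mp, 4) = 32 inodes.  *startino is
 * moved back by 32 and the allocmask 0x00ff is shifted left by 32 / 4 = 8
 * holemask positions to 0xff00, so the newly allocated inodes still map to
 * the upper half of the chunk relative to the aligned startino.
 */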
/*
 * Determine whether the source inode record can merge into the target. Both
 * records must be sparse, the inode ranges must match and there must be no
 * allocation overlap between the records.
 */
STATIC bool
__xfs_inobt_can_merge(
	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
	struct xfs_inobt_rec_incore	*srec)	/* src record */
{
	uint64_t			talloc;
	uint64_t			salloc;

	/* records must cover the same inode range */
	if (trec->ir_startino != srec->ir_startino)
		return false;

	/* both records must be sparse */
	if (!xfs_inobt_issparse(trec->ir_holemask) ||
	    !xfs_inobt_issparse(srec->ir_holemask))
		return false;

	/* both records must track some inodes */
	if (!trec->ir_count || !srec->ir_count)
		return false;

	/* can't exceed capacity of a full record */
	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
		return false;

	/* verify there is no allocation overlap */
	talloc = xfs_inobt_irec_to_allocmask(trec);
	salloc = xfs_inobt_irec_to_allocmask(srec);
	if (talloc & salloc)
		return false;

	return true;
}
/*
 * Merge the source inode record into the target. The caller must call
 * __xfs_inobt_can_merge() to ensure the merge is valid.
 */
STATIC void
__xfs_inobt_rec_merge(
	struct xfs_inobt_rec_incore	*trec,	/* target */
	struct xfs_inobt_rec_incore	*srec)	/* src */
{
	ASSERT(trec->ir_startino == srec->ir_startino);

	/* combine the counts */
	trec->ir_count += srec->ir_count;
	trec->ir_freecount += srec->ir_freecount;

	/*
	 * Merge the holemask and free mask. For both fields, 0 bits refer to
	 * allocated inodes. We combine the allocated ranges with bitwise AND.
	 */
	trec->ir_holemask &= srec->ir_holemask;
	trec->ir_free &= srec->ir_free;
}
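/*
 * Worked example (illustrative only, values assumed): merging an existing
 * target record with ir_holemask = 0xff00, ir_count = 32, ir_freecount = 10
 * (inodes 0-31 allocated, ten of them free) with a new source record with
 * ir_holemask = 0x00ff, ir_count = 32, ir_freecount = 32 (inodes 32-63 just
 * allocated, all free) yields ir_holemask = 0x0000, ir_count = 64 and
 * ir_freecount = 42 - a fully populated chunk.  ANDing the free masks is
 * correct because hole regions carry 1 bits in ir_free.
 */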
/*
 * Insert a new sparse inode chunk into the associated inode allocation btree.
 * The inode record for the sparse chunk is pre-aligned to a startino that
 * should match any pre-existing sparse inode record in the tree. This allows
 * sparse chunks to fill over time.
 *
 * If no preexisting record exists, the provided record is inserted.
 * If there is a preexisting record, the provided record is merged with the
 * existing record and updated in place. The merged record is returned in nrec.
 *
 * It is considered corruption if a merge is requested and not possible. Given
 * the sparse inode alignment constraints, this should never happen.
 */
STATIC int
xfs_inobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new/merged rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;
	struct xfs_inobt_rec_incore	rec;

	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	/*
	 * A record exists at this startino. Merge the records.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}
	if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * This should never fail. If we have coexisting records that
	 * cannot merge, something is seriously wrong.
	 */
	if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	trace_xfs_irec_merge_pre(pag, &rec, nrec);

	/* merge to nrec to output the updated record */
	__xfs_inobt_rec_merge(nrec, &rec);

	trace_xfs_irec_merge_post(pag, nrec);

	error = xfs_inobt_rec_check_count(mp, nrec);
	if (error)
		goto error;

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
/*
 * Insert a new sparse inode chunk into the free inode btree. The inode
 * record for the sparse chunk is pre-aligned to a startino that should match
 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 * to fill over time.
 *
 * The new record is always inserted, overwriting a pre-existing record if
 * there is one.
 */
STATIC int
xfs_finobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
683 * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
684 * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
685 * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
686 * inode count threshold, or the usual negative error code for other errors.
690 struct xfs_perag
*pag
,
691 struct xfs_trans
*tp
,
692 struct xfs_buf
*agbp
)
695 struct xfs_alloc_arg args
;
697 xfs_agino_t newino
; /* new first inode's number */
698 xfs_agino_t newlen
; /* new number of inodes */
699 int isaligned
= 0; /* inode allocation at stripe */
701 /* init. to full chunk */
702 struct xfs_inobt_rec_incore rec
;
703 struct xfs_ino_geometry
*igeo
= M_IGEO(tp
->t_mountp
);
704 uint16_t allocmask
= (uint16_t) -1;
707 memset(&args
, 0, sizeof(args
));
709 args
.mp
= tp
->t_mountp
;
710 args
.fsbno
= NULLFSBLOCK
;
711 args
.oinfo
= XFS_RMAP_OINFO_INODES
;
715 /* randomly do sparse inode allocations */
716 if (xfs_has_sparseinodes(tp
->t_mountp
) &&
717 igeo
->ialloc_min_blks
< igeo
->ialloc_blks
)
718 do_sparse
= get_random_u32_below(2);
722 * Locking will ensure that we don't have two callers in here
725 newlen
= igeo
->ialloc_inos
;
726 if (igeo
->maxicount
&&
727 percpu_counter_read_positive(&args
.mp
->m_icount
) + newlen
>
730 args
.minlen
= args
.maxlen
= igeo
->ialloc_blks
;
732 * First try to allocate inodes contiguous with the last-allocated
733 * chunk of inodes. If the filesystem is striped, this will fill
734 * an entire stripe unit with inodes.
737 newino
= be32_to_cpu(agi
->agi_newino
);
738 args
.agbno
= XFS_AGINO_TO_AGBNO(args
.mp
, newino
) +
742 if (likely(newino
!= NULLAGINO
&&
743 (args
.agbno
< be32_to_cpu(agi
->agi_length
)))) {
747 * We need to take into account alignment here to ensure that
748 * we don't modify the free list if we fail to have an exact
749 * block. If we don't have an exact match, and every other
750 * allocation attempt fails, we'll end up cancelling
751 * a dirty transaction and shutting down.
753 * For an exact allocation, alignment must be 1,
754 * however we need to take cluster alignment into account when
755 * fixing up the freelist. Use the minalignslop field to
756 * indicate that extra blocks might be required for alignment,
757 * but not to use them in the actual exact allocation.
760 args
.minalignslop
= igeo
->cluster_align
- 1;
762 /* Allow space for the inode btree to split. */
763 args
.minleft
= igeo
->inobt_maxlevels
;
764 error
= xfs_alloc_vextent_exact_bno(&args
,
765 xfs_agbno_to_fsb(pag
, args
.agbno
));
770 * This request might have dirtied the transaction if the AG can
771 * satisfy the request, but the exact block was not available.
772 * If the allocation did fail, subsequent requests will relax
773 * the exact agbno requirement and increase the alignment
774 * instead. It is critical that the total size of the request
775 * (len + alignment + slop) does not increase from this point
776 * on, so reset minalignslop to ensure it is not included in
777 * subsequent requests.
779 args
.minalignslop
= 0;
782 if (unlikely(args
.fsbno
== NULLFSBLOCK
)) {
784 * Set the alignment for the allocation.
785 * If stripe alignment is turned on then align at stripe unit
787 * If the cluster size is smaller than a filesystem block
788 * then we're doing I/O for inodes in filesystem block size
789 * pieces, so don't need alignment anyway.
792 if (igeo
->ialloc_align
) {
793 ASSERT(!xfs_has_noalign(args
.mp
));
794 args
.alignment
= args
.mp
->m_dalign
;
797 args
.alignment
= igeo
->cluster_align
;
799 * Allocate a fixed-size extent of inodes.
803 * Allow space for the inode btree to split.
805 args
.minleft
= igeo
->inobt_maxlevels
;
806 error
= xfs_alloc_vextent_near_bno(&args
,
807 xfs_agbno_to_fsb(pag
,
808 be32_to_cpu(agi
->agi_root
)));
814 * If stripe alignment is turned on, then try again with cluster
817 if (isaligned
&& args
.fsbno
== NULLFSBLOCK
) {
818 args
.alignment
= igeo
->cluster_align
;
819 error
= xfs_alloc_vextent_near_bno(&args
,
820 xfs_agbno_to_fsb(pag
,
821 be32_to_cpu(agi
->agi_root
)));
827 * Finally, try a sparse allocation if the filesystem supports it and
828 * the sparse allocation length is smaller than a full chunk.
830 if (xfs_has_sparseinodes(args
.mp
) &&
831 igeo
->ialloc_min_blks
< igeo
->ialloc_blks
&&
832 args
.fsbno
== NULLFSBLOCK
) {
834 args
.alignment
= args
.mp
->m_sb
.sb_spino_align
;
837 args
.minlen
= igeo
->ialloc_min_blks
;
838 args
.maxlen
= args
.minlen
;
841 * The inode record will be aligned to full chunk size. We must
842 * prevent sparse allocation from AG boundaries that result in
843 * invalid inode records, such as records that start at agbno 0
844 * or extend beyond the AG.
846 * Set min agbno to the first aligned, non-zero agbno and max to
847 * the last aligned agbno that is at least one full chunk from
850 args
.min_agbno
= args
.mp
->m_sb
.sb_inoalignmt
;
851 args
.max_agbno
= round_down(xfs_ag_block_count(args
.mp
,
853 args
.mp
->m_sb
.sb_inoalignmt
) -
856 error
= xfs_alloc_vextent_near_bno(&args
,
857 xfs_agbno_to_fsb(pag
,
858 be32_to_cpu(agi
->agi_root
)));
862 newlen
= XFS_AGB_TO_AGINO(args
.mp
, args
.len
);
863 ASSERT(newlen
<= XFS_INODES_PER_CHUNK
);
864 allocmask
= (1 << (newlen
/ XFS_INODES_PER_HOLEMASK_BIT
)) - 1;
867 if (args
.fsbno
== NULLFSBLOCK
)
870 ASSERT(args
.len
== args
.minlen
);
873 * Stamp and write the inode buffers.
875 * Seed the new inode cluster with a random generation number. This
876 * prevents short-term reuse of generation numbers if a chunk is
877 * freed and then immediately reallocated. We use random numbers
878 * rather than a linear progression to prevent the next generation
879 * number from being easily guessable.
881 error
= xfs_ialloc_inode_init(args
.mp
, tp
, NULL
, newlen
, pag_agno(pag
),
882 args
.agbno
, args
.len
, get_random_u32());
887 * Convert the results.
889 newino
= XFS_AGB_TO_AGINO(args
.mp
, args
.agbno
);
891 if (xfs_inobt_issparse(~allocmask
)) {
893 * We've allocated a sparse chunk. Align the startino and mask.
895 xfs_align_sparse_ino(args
.mp
, &newino
, &allocmask
);
897 rec
.ir_startino
= newino
;
898 rec
.ir_holemask
= ~allocmask
;
899 rec
.ir_count
= newlen
;
900 rec
.ir_freecount
= newlen
;
901 rec
.ir_free
= XFS_INOBT_ALL_FREE
;
904 * Insert the sparse record into the inobt and allow for a merge
905 * if necessary. If a merge does occur, rec is updated to the
908 error
= xfs_inobt_insert_sprec(pag
, tp
, agbp
, &rec
);
909 if (error
== -EFSCORRUPTED
) {
911 "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
912 xfs_agino_to_ino(pag
, rec
.ir_startino
),
913 rec
.ir_holemask
, rec
.ir_count
);
914 xfs_force_shutdown(args
.mp
, SHUTDOWN_CORRUPT_INCORE
);
920 * We can't merge the part we've just allocated as for the inobt
921 * due to finobt semantics. The original record may or may not
922 * exist independent of whether physical inodes exist in this
925 * We must update the finobt record based on the inobt record.
926 * rec contains the fully merged and up to date inobt record
927 * from the previous call. Set merge false to replace any
928 * existing record with this one.
930 if (xfs_has_finobt(args
.mp
)) {
931 error
= xfs_finobt_insert_sprec(pag
, tp
, agbp
, &rec
);
936 /* full chunk - insert new records to both btrees */
937 error
= xfs_inobt_insert(pag
, tp
, agbp
, newino
, newlen
, false);
941 if (xfs_has_finobt(args
.mp
)) {
942 error
= xfs_inobt_insert(pag
, tp
, agbp
, newino
,
950 * Update AGI counts and newino.
952 be32_add_cpu(&agi
->agi_count
, newlen
);
953 be32_add_cpu(&agi
->agi_freecount
, newlen
);
954 pag
->pagi_freecount
+= newlen
;
955 pag
->pagi_count
+= newlen
;
956 agi
->agi_newino
= cpu_to_be32(newino
);
959 * Log allocation group header fields
961 xfs_ialloc_log_agi(tp
, agbp
,
962 XFS_AGI_COUNT
| XFS_AGI_FREECOUNT
| XFS_AGI_NEWINO
);
964 * Modify/log superblock values for inode count and inode free count.
966 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_ICOUNT
, (long)newlen
);
967 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, (long)newlen
);
972 * Try to retrieve the next record to the left/right from the current one.
976 struct xfs_btree_cur
*cur
,
977 xfs_inobt_rec_incore_t
*rec
,
985 error
= xfs_btree_decrement(cur
, 0, &i
);
987 error
= xfs_btree_increment(cur
, 0, &i
);
993 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
996 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
997 xfs_btree_mark_sick(cur
);
998 return -EFSCORRUPTED
;
1007 struct xfs_btree_cur
*cur
,
1009 xfs_inobt_rec_incore_t
*rec
,
1015 error
= xfs_inobt_lookup(cur
, agino
, XFS_LOOKUP_EQ
, &i
);
1020 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1023 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1024 xfs_btree_mark_sick(cur
);
1025 return -EFSCORRUPTED
;
1033 * Return the offset of the first free inode in the record. If the inode chunk
1034 * is sparsely allocated, we convert the record holemask to inode granularity
1035 * and mask off the unallocated regions from the inode free mask.
1038 xfs_inobt_first_free_inode(
1039 struct xfs_inobt_rec_incore
*rec
)
1041 xfs_inofree_t realfree
;
1043 /* if there are no holes, return the first available offset */
1044 if (!xfs_inobt_issparse(rec
->ir_holemask
))
1045 return xfs_lowbit64(rec
->ir_free
);
1047 realfree
= xfs_inobt_irec_to_allocmask(rec
);
1048 realfree
&= rec
->ir_free
;
1050 return xfs_lowbit64(realfree
);
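/*
 * Worked example (illustrative only, values assumed): for a sparse record
 * with ir_holemask = 0x00ff (inodes 0-31 are holes), the allocmask is
 * 0xffffffff00000000.  If inodes 32-35 are in use, ir_free is
 * 0xfffffff0ffffffff (hole bits stay set), realfree masks down to
 * 0xfffffff000000000 and xfs_lowbit64() returns offset 36 - the first
 * genuinely free, physically allocated inode in the chunk.
 */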
1054 * If this AG has corrupt inodes, check if allocating this inode would fail
1055 * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
1059 xfs_dialloc_check_ino(
1060 struct xfs_perag
*pag
,
1061 struct xfs_trans
*tp
,
1064 struct xfs_imap imap
;
1068 error
= xfs_imap(pag
, tp
, ino
, &imap
, 0);
1072 error
= xfs_imap_to_bp(pag_mount(pag
), tp
, &imap
, &bp
);
1076 xfs_trans_brelse(tp
, bp
);
1081 * Allocate an inode using the inobt-only algorithm.
1084 xfs_dialloc_ag_inobt(
1085 struct xfs_perag
*pag
,
1086 struct xfs_trans
*tp
,
1087 struct xfs_buf
*agbp
,
1091 struct xfs_mount
*mp
= tp
->t_mountp
;
1092 struct xfs_agi
*agi
= agbp
->b_addr
;
1093 xfs_agnumber_t pagno
= XFS_INO_TO_AGNO(mp
, parent
);
1094 xfs_agino_t pagino
= XFS_INO_TO_AGINO(mp
, parent
);
1095 struct xfs_btree_cur
*cur
, *tcur
;
1096 struct xfs_inobt_rec_incore rec
, trec
;
1101 int searchdistance
= 10;
1103 ASSERT(xfs_perag_initialised_agi(pag
));
1104 ASSERT(xfs_perag_allows_inodes(pag
));
1105 ASSERT(pag
->pagi_freecount
> 0);
1108 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
1110 * If pagino is 0 (this is the root inode allocation) use newino.
1111 * This must work because we've just allocated some.
1114 pagino
= be32_to_cpu(agi
->agi_newino
);
1116 error
= xfs_check_agi_freecount(cur
);
1121 * If in the same AG as the parent, try to get near the parent.
1123 if (pagno
== pag_agno(pag
)) {
1124 int doneleft
; /* done, to the left */
1125 int doneright
; /* done, to the right */
1127 error
= xfs_inobt_lookup(cur
, pagino
, XFS_LOOKUP_LE
, &i
);
1130 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1131 xfs_btree_mark_sick(cur
);
1132 error
= -EFSCORRUPTED
;
1136 error
= xfs_inobt_get_rec(cur
, &rec
, &j
);
1139 if (XFS_IS_CORRUPT(mp
, j
!= 1)) {
1140 xfs_btree_mark_sick(cur
);
1141 error
= -EFSCORRUPTED
;
1145 if (rec
.ir_freecount
> 0) {
1147 * Found a free inode in the same chunk
1148 * as the parent, done.
1155 * In the same AG as parent, but parent's chunk is full.
1158 /* duplicate the cursor, search left & right simultaneously */
1159 error
= xfs_btree_dup_cursor(cur
, &tcur
);
1164 * Skip to last blocks looked up if same parent inode.
1166 if (pagino
!= NULLAGINO
&&
1167 pag
->pagl_pagino
== pagino
&&
1168 pag
->pagl_leftrec
!= NULLAGINO
&&
1169 pag
->pagl_rightrec
!= NULLAGINO
) {
1170 error
= xfs_ialloc_get_rec(tcur
, pag
->pagl_leftrec
,
1175 error
= xfs_ialloc_get_rec(cur
, pag
->pagl_rightrec
,
1180 /* search left with tcur, back up 1 record */
1181 error
= xfs_ialloc_next_rec(tcur
, &trec
, &doneleft
, 1);
1185 /* search right with cur, go forward 1 record. */
1186 error
= xfs_ialloc_next_rec(cur
, &rec
, &doneright
, 0);
1192 * Loop until we find an inode chunk with a free inode.
1194 while (--searchdistance
> 0 && (!doneleft
|| !doneright
)) {
1195 int useleft
; /* using left inode chunk this time */
1197 /* figure out the closer block if both are valid. */
1198 if (!doneleft
&& !doneright
) {
1200 (trec
.ir_startino
+ XFS_INODES_PER_CHUNK
- 1) <
1201 rec
.ir_startino
- pagino
;
1203 useleft
= !doneleft
;
1206 /* free inodes to the left? */
1207 if (useleft
&& trec
.ir_freecount
) {
1208 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1211 pag
->pagl_leftrec
= trec
.ir_startino
;
1212 pag
->pagl_rightrec
= rec
.ir_startino
;
1213 pag
->pagl_pagino
= pagino
;
1218 /* free inodes to the right? */
1219 if (!useleft
&& rec
.ir_freecount
) {
1220 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1222 pag
->pagl_leftrec
= trec
.ir_startino
;
1223 pag
->pagl_rightrec
= rec
.ir_startino
;
1224 pag
->pagl_pagino
= pagino
;
1228 /* get next record to check */
1230 error
= xfs_ialloc_next_rec(tcur
, &trec
,
1233 error
= xfs_ialloc_next_rec(cur
, &rec
,
1240 if (searchdistance
<= 0) {
1242 * Not in range - save last search
1243 * location and allocate a new inode
1245 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1246 pag
->pagl_leftrec
= trec
.ir_startino
;
1247 pag
->pagl_rightrec
= rec
.ir_startino
;
1248 pag
->pagl_pagino
= pagino
;
1252 * We've reached the end of the btree. Because
1253 * we are only searching a small chunk of the
1254 * btree each search, there are obviously free
1255 * inodes closer to the parent inode than we
1256 * are now. Restart the search again.
1258 pag
->pagl_pagino
= NULLAGINO
;
1259 pag
->pagl_leftrec
= NULLAGINO
;
1260 pag
->pagl_rightrec
= NULLAGINO
;
1261 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1262 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1268 * In a different AG from the parent.
1269 * See if the most recently allocated block has any free.
1271 if (agi
->agi_newino
!= cpu_to_be32(NULLAGINO
)) {
1272 error
= xfs_inobt_lookup(cur
, be32_to_cpu(agi
->agi_newino
),
1278 error
= xfs_inobt_get_rec(cur
, &rec
, &j
);
1282 if (j
== 1 && rec
.ir_freecount
> 0) {
1284 * The last chunk allocated in the group
1285 * still has a free inode.
1293 * None left in the last group, search the whole AG
1295 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
1298 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1299 xfs_btree_mark_sick(cur
);
1300 error
= -EFSCORRUPTED
;
1305 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
1308 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1309 xfs_btree_mark_sick(cur
);
1310 error
= -EFSCORRUPTED
;
1313 if (rec
.ir_freecount
> 0)
1315 error
= xfs_btree_increment(cur
, 0, &i
);
1318 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1319 xfs_btree_mark_sick(cur
);
1320 error
= -EFSCORRUPTED
;
1326 offset
= xfs_inobt_first_free_inode(&rec
);
1327 ASSERT(offset
>= 0);
1328 ASSERT(offset
< XFS_INODES_PER_CHUNK
);
1329 ASSERT((XFS_AGINO_TO_OFFSET(mp
, rec
.ir_startino
) %
1330 XFS_INODES_PER_CHUNK
) == 0);
1331 ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
+ offset
);
1333 if (xfs_ag_has_sickness(pag
, XFS_SICK_AG_INODES
)) {
1334 error
= xfs_dialloc_check_ino(pag
, tp
, ino
);
1339 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1341 error
= xfs_inobt_update(cur
, &rec
);
1344 be32_add_cpu(&agi
->agi_freecount
, -1);
1345 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
1346 pag
->pagi_freecount
--;
1348 error
= xfs_check_agi_freecount(cur
);
1352 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1353 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -1);
1357 xfs_btree_del_cursor(tcur
, XFS_BTREE_ERROR
);
1359 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
1364 * Use the free inode btree to allocate an inode based on distance from the
1365 * parent. Note that the provided cursor may be deleted and replaced.
1368 xfs_dialloc_ag_finobt_near(
1370 struct xfs_btree_cur
**ocur
,
1371 struct xfs_inobt_rec_incore
*rec
)
1373 struct xfs_btree_cur
*lcur
= *ocur
; /* left search cursor */
1374 struct xfs_btree_cur
*rcur
; /* right search cursor */
1375 struct xfs_inobt_rec_incore rrec
;
1379 error
= xfs_inobt_lookup(lcur
, pagino
, XFS_LOOKUP_LE
, &i
);
1384 error
= xfs_inobt_get_rec(lcur
, rec
, &i
);
1387 if (XFS_IS_CORRUPT(lcur
->bc_mp
, i
!= 1)) {
1388 xfs_btree_mark_sick(lcur
);
1389 return -EFSCORRUPTED
;
1393 * See if we've landed in the parent inode record. The finobt
1394 * only tracks chunks with at least one free inode, so record
1395 * existence is enough.
1397 if (pagino
>= rec
->ir_startino
&&
1398 pagino
< (rec
->ir_startino
+ XFS_INODES_PER_CHUNK
))
1402 error
= xfs_btree_dup_cursor(lcur
, &rcur
);
1406 error
= xfs_inobt_lookup(rcur
, pagino
, XFS_LOOKUP_GE
, &j
);
1410 error
= xfs_inobt_get_rec(rcur
, &rrec
, &j
);
1413 if (XFS_IS_CORRUPT(lcur
->bc_mp
, j
!= 1)) {
1414 xfs_btree_mark_sick(lcur
);
1415 error
= -EFSCORRUPTED
;
1420 if (XFS_IS_CORRUPT(lcur
->bc_mp
, i
!= 1 && j
!= 1)) {
1421 xfs_btree_mark_sick(lcur
);
1422 error
= -EFSCORRUPTED
;
1425 if (i
== 1 && j
== 1) {
1427 * Both the left and right records are valid. Choose the closer
1428 * inode chunk to the target.
1430 if ((pagino
- rec
->ir_startino
+ XFS_INODES_PER_CHUNK
- 1) >
1431 (rrec
.ir_startino
- pagino
)) {
1433 xfs_btree_del_cursor(lcur
, XFS_BTREE_NOERROR
);
1436 xfs_btree_del_cursor(rcur
, XFS_BTREE_NOERROR
);
1438 } else if (j
== 1) {
1439 /* only the right record is valid */
1441 xfs_btree_del_cursor(lcur
, XFS_BTREE_NOERROR
);
1443 } else if (i
== 1) {
1444 /* only the left record is valid */
1445 xfs_btree_del_cursor(rcur
, XFS_BTREE_NOERROR
);
1451 xfs_btree_del_cursor(rcur
, XFS_BTREE_ERROR
);
1456 * Use the free inode btree to find a free inode based on a newino hint. If
1457 * the hint is NULL, find the first free inode in the AG.
1460 xfs_dialloc_ag_finobt_newino(
1461 struct xfs_agi
*agi
,
1462 struct xfs_btree_cur
*cur
,
1463 struct xfs_inobt_rec_incore
*rec
)
1468 if (agi
->agi_newino
!= cpu_to_be32(NULLAGINO
)) {
1469 error
= xfs_inobt_lookup(cur
, be32_to_cpu(agi
->agi_newino
),
1474 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1477 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1478 xfs_btree_mark_sick(cur
);
1479 return -EFSCORRUPTED
;
1486 * Find the first inode available in the AG.
1488 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
1491 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1492 xfs_btree_mark_sick(cur
);
1493 return -EFSCORRUPTED
;
1496 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1499 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1500 xfs_btree_mark_sick(cur
);
1501 return -EFSCORRUPTED
;
1508 * Update the inobt based on a modification made to the finobt. Also ensure that
1509 * the records from both trees are equivalent post-modification.
1512 xfs_dialloc_ag_update_inobt(
1513 struct xfs_btree_cur
*cur
, /* inobt cursor */
1514 struct xfs_inobt_rec_incore
*frec
, /* finobt record */
1515 int offset
) /* inode offset */
1517 struct xfs_inobt_rec_incore rec
;
1521 error
= xfs_inobt_lookup(cur
, frec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
1524 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1525 xfs_btree_mark_sick(cur
);
1526 return -EFSCORRUPTED
;
1529 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
1532 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1533 xfs_btree_mark_sick(cur
);
1534 return -EFSCORRUPTED
;
1536 ASSERT((XFS_AGINO_TO_OFFSET(cur
->bc_mp
, rec
.ir_startino
) %
1537 XFS_INODES_PER_CHUNK
) == 0);
1539 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1542 if (XFS_IS_CORRUPT(cur
->bc_mp
,
1543 rec
.ir_free
!= frec
->ir_free
||
1544 rec
.ir_freecount
!= frec
->ir_freecount
)) {
1545 xfs_btree_mark_sick(cur
);
1546 return -EFSCORRUPTED
;
1549 return xfs_inobt_update(cur
, &rec
);
1553 * Allocate an inode using the free inode btree, if available. Otherwise, fall
1554 * back to the inobt search algorithm.
1556 * The caller selected an AG for us, and made sure that free inodes are
1561 struct xfs_perag
*pag
,
1562 struct xfs_trans
*tp
,
1563 struct xfs_buf
*agbp
,
1567 struct xfs_mount
*mp
= tp
->t_mountp
;
1568 struct xfs_agi
*agi
= agbp
->b_addr
;
1569 xfs_agnumber_t pagno
= XFS_INO_TO_AGNO(mp
, parent
);
1570 xfs_agino_t pagino
= XFS_INO_TO_AGINO(mp
, parent
);
1571 struct xfs_btree_cur
*cur
; /* finobt cursor */
1572 struct xfs_btree_cur
*icur
; /* inobt cursor */
1573 struct xfs_inobt_rec_incore rec
;
1579 if (!xfs_has_finobt(mp
))
1580 return xfs_dialloc_ag_inobt(pag
, tp
, agbp
, parent
, inop
);
1583 * If pagino is 0 (this is the root inode allocation) use newino.
1584 * This must work because we've just allocated some.
1587 pagino
= be32_to_cpu(agi
->agi_newino
);
1589 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
1591 error
= xfs_check_agi_freecount(cur
);
1596 * The search algorithm depends on whether we're in the same AG as the
1597 * parent. If so, find the closest available inode to the parent. If
1598 * not, consider the agi hint or find the first free inode in the AG.
1600 if (pag_agno(pag
) == pagno
)
1601 error
= xfs_dialloc_ag_finobt_near(pagino
, &cur
, &rec
);
1603 error
= xfs_dialloc_ag_finobt_newino(agi
, cur
, &rec
);
1607 offset
= xfs_inobt_first_free_inode(&rec
);
1608 ASSERT(offset
>= 0);
1609 ASSERT(offset
< XFS_INODES_PER_CHUNK
);
1610 ASSERT((XFS_AGINO_TO_OFFSET(mp
, rec
.ir_startino
) %
1611 XFS_INODES_PER_CHUNK
) == 0);
1612 ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
+ offset
);
1614 if (xfs_ag_has_sickness(pag
, XFS_SICK_AG_INODES
)) {
1615 error
= xfs_dialloc_check_ino(pag
, tp
, ino
);
1621 * Modify or remove the finobt record.
1623 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1625 if (rec
.ir_freecount
)
1626 error
= xfs_inobt_update(cur
, &rec
);
1628 error
= xfs_btree_delete(cur
, &i
);
1633 * The finobt has now been updated appropriately. We haven't updated the
1634 * agi and superblock yet, so we can create an inobt cursor and validate
1635 * the original freecount. If all is well, make the equivalent update to
1636 * the inobt using the finobt record and offset information.
1638 icur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
1640 error
= xfs_check_agi_freecount(icur
);
1644 error
= xfs_dialloc_ag_update_inobt(icur
, &rec
, offset
);
1649 * Both trees have now been updated. We must update the perag and
1650 * superblock before we can check the freecount for each btree.
1652 be32_add_cpu(&agi
->agi_freecount
, -1);
1653 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
1654 pag
->pagi_freecount
--;
1656 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -1);
1658 error
= xfs_check_agi_freecount(icur
);
1661 error
= xfs_check_agi_freecount(cur
);
1665 xfs_btree_del_cursor(icur
, XFS_BTREE_NOERROR
);
1666 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1671 xfs_btree_del_cursor(icur
, XFS_BTREE_ERROR
);
1673 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
1679 struct xfs_trans
**tpp
,
1680 struct xfs_buf
*agibp
)
1682 struct xfs_trans
*tp
= *tpp
;
1683 struct xfs_dquot_acct
*dqinfo
;
1687 * Hold on to the agibp across the commit so no other allocation can
1688 * come in and take the free inodes we just allocated for our caller.
1690 xfs_trans_bhold(tp
, agibp
);
1693 * We want the quota changes to be associated with the next transaction,
1694 * NOT this one. So, detach the dqinfo from this and attach it to the
1697 dqinfo
= tp
->t_dqinfo
;
1698 tp
->t_dqinfo
= NULL
;
1700 error
= xfs_trans_roll(&tp
);
1702 /* Re-attach the quota info that we detached from prev trx. */
1703 tp
->t_dqinfo
= dqinfo
;
1706 * Join the buffer even on commit error so that the buffer is released
1707 * when the caller cancels the transaction and doesn't have to handle
1708 * this error case specially.
1710 xfs_trans_bjoin(tp
, agibp
);
1716 xfs_dialloc_good_ag(
1717 struct xfs_perag
*pag
,
1718 struct xfs_trans
*tp
,
1723 struct xfs_mount
*mp
= tp
->t_mountp
;
1725 xfs_extlen_t longest
= 0;
1731 if (!xfs_perag_allows_inodes(pag
))
1734 if (!xfs_perag_initialised_agi(pag
)) {
1735 error
= xfs_ialloc_read_agi(pag
, tp
, 0, NULL
);
1740 if (pag
->pagi_freecount
)
1745 if (!xfs_perag_initialised_agf(pag
)) {
1746 error
= xfs_alloc_read_agf(pag
, tp
, flags
, NULL
);
1752 * Check that there is enough free space for the file plus a chunk of
1753 * inodes if we need to allocate some. If this is the first pass across
1754 * the AGs, take into account the potential space needed for alignment
1755 * of inode chunks when checking the longest contiguous free space in
1756 * the AG - this prevents us from getting ENOSPC because we have free
1757 * space larger than ialloc_blks but alignment constraints prevent us
1760 * If we can't find an AG with space for full alignment slack to be
1761 * taken into account, we must be near ENOSPC in all AGs. Hence we
1762 * don't include alignment for the second pass and so if we fail
1763 * allocation due to alignment issues then it is most likely a real
1766 * XXX(dgc): this calculation is now bogus thanks to the per-ag
1767 * reservations that xfs_alloc_fix_freelist() now does via
1768 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
1769 * be more than large enough for the check below to succeed, but
1770 * xfs_alloc_space_available() will fail because of the non-zero
1771 * metadata reservation and hence we won't actually be able to allocate
1772 * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
1775 ineed
= M_IGEO(mp
)->ialloc_min_blks
;
1776 if (flags
&& ineed
> 1)
1777 ineed
+= M_IGEO(mp
)->cluster_align
;
1778 longest
= pag
->pagf_longest
;
1780 longest
= pag
->pagf_flcount
> 0;
1781 needspace
= S_ISDIR(mode
) || S_ISREG(mode
) || S_ISLNK(mode
);
1783 if (pag
->pagf_freeblks
< needspace
+ ineed
|| longest
< ineed
)
1790 struct xfs_perag
*pag
,
1791 struct xfs_trans
**tpp
,
1796 struct xfs_buf
*agbp
;
1801 * Then read in the AGI buffer and recheck with the AGI buffer
1804 error
= xfs_ialloc_read_agi(pag
, *tpp
, 0, &agbp
);
1808 if (!pag
->pagi_freecount
) {
1814 error
= xfs_ialloc_ag_alloc(pag
, *tpp
, agbp
);
1819 * We successfully allocated space for an inode cluster in this
1820 * AG. Roll the transaction so that we can allocate one of the
1823 ASSERT(pag
->pagi_freecount
> 0);
1824 error
= xfs_dialloc_roll(tpp
, agbp
);
1829 /* Allocate an inode in the found AG */
1830 error
= xfs_dialloc_ag(pag
, *tpp
, agbp
, parent
, &ino
);
1836 xfs_trans_brelse(*tpp
, agbp
);
1841 * Pick an AG for the new inode.
1843 * Directories, symlinks, and regular files frequently allocate at least one
1844 * block, so factor that potential expansion when we examine whether an AG has
1845 * enough space for file creation. Try to keep metadata files all in the same
1848 static inline xfs_agnumber_t
1849 xfs_dialloc_pick_ag(
1850 struct xfs_mount
*mp
,
1851 struct xfs_inode
*dp
,
1854 xfs_agnumber_t start_agno
;
1858 if (xfs_is_metadir_inode(dp
)) {
1859 if (mp
->m_sb
.sb_logstart
)
1860 return XFS_FSB_TO_AGNO(mp
, mp
->m_sb
.sb_logstart
);
1865 return (atomic_inc_return(&mp
->m_agirotor
) - 1) % mp
->m_maxagi
;
1867 start_agno
= XFS_INO_TO_AGNO(mp
, dp
->i_ino
);
1868 if (start_agno
>= mp
->m_maxagi
)
1875 * Allocate an on-disk inode.
1877 * Mode is used to tell whether the new inode is a directory and hence where to
1878 * locate it. The on-disk inode that is allocated will be returned in @new_ino
1879 * on success, otherwise an error will be set to indicate the failure (e.g.
1884 struct xfs_trans
**tpp
,
1885 const struct xfs_icreate_args
*args
,
1888 struct xfs_mount
*mp
= (*tpp
)->t_mountp
;
1889 struct xfs_perag
*pag
;
1890 struct xfs_ino_geometry
*igeo
= M_IGEO(mp
);
1891 xfs_ino_t ino
= NULLFSINO
;
1892 xfs_ino_t parent
= args
->pip
? args
->pip
->i_ino
: 0;
1893 xfs_agnumber_t agno
;
1894 xfs_agnumber_t start_agno
;
1895 umode_t mode
= args
->mode
& S_IFMT
;
1896 bool ok_alloc
= true;
1897 bool low_space
= false;
1901 start_agno
= xfs_dialloc_pick_ag(mp
, args
->pip
, mode
);
1904 * If we have already hit the ceiling of inode blocks then clear
1905 * ok_alloc so we scan all available agi structures for a free
1908 * Read rough value of mp->m_icount by percpu_counter_read_positive,
1909 * which sacrifices precision but improves performance.
1911 if (igeo
->maxicount
&&
1912 percpu_counter_read_positive(&mp
->m_icount
) + igeo
->ialloc_inos
1913 > igeo
->maxicount
) {
1918 * If we are near to ENOSPC, we want to prefer allocation from AGs that
1919 * have free inodes in them rather than use up free space allocating new
1920 * inode chunks. Hence we turn off allocation for the first non-blocking
1921 * pass through the AGs if we are near ENOSPC to consume free inodes
1922 * that we can immediately allocate, but then we allow allocation on the
1923 * second pass if we fail to find an AG with free inodes in it.
1925 if (xfs_estimate_freecounter(mp
, XC_FREE_BLOCKS
) <
1926 mp
->m_low_space
[XFS_LOWSP_1_PCNT
]) {
1932 * Loop until we find an allocation group that either has free inodes
1933 * or in which we can allocate some inodes. Iterate through the
1934 * allocation groups upward, wrapping at the end.
1936 flags
= XFS_ALLOC_FLAG_TRYLOCK
;
1938 for_each_perag_wrap_at(mp
, start_agno
, mp
->m_maxagi
, agno
, pag
) {
1939 if (xfs_dialloc_good_ag(pag
, *tpp
, mode
, flags
, ok_alloc
)) {
1940 error
= xfs_dialloc_try_ag(pag
, tpp
, parent
,
1942 if (error
!= -EAGAIN
)
1947 if (xfs_is_shutdown(mp
)) {
1948 error
= -EFSCORRUPTED
;
1953 xfs_perag_rele(pag
);
1956 if (ino
== NULLFSINO
) {
1967 * Protect against obviously corrupt allocation btree records. Later
1968 * xfs_iget checks will catch re-allocation of other active in-memory
1969 * and on-disk inodes. If we don't catch reallocating the parent inode
1970 * here we will deadlock in xfs_iget() so we have to do these checks
1973 if (ino
== parent
|| !xfs_verify_dir_ino(mp
, ino
)) {
1974 xfs_alert(mp
, "Allocated a known in-use inode 0x%llx!", ino
);
1975 xfs_agno_mark_sick(mp
, XFS_INO_TO_AGNO(mp
, ino
),
1977 return -EFSCORRUPTED
;
1985 * Free the blocks of an inode chunk. We must consider that the inode chunk
1986 * might be sparse and only free the regions that are allocated as part of the
1990 xfs_difree_inode_chunk(
1991 struct xfs_trans
*tp
,
1992 struct xfs_perag
*pag
,
1993 struct xfs_inobt_rec_incore
*rec
)
1995 struct xfs_mount
*mp
= tp
->t_mountp
;
1996 xfs_agblock_t sagbno
= XFS_AGINO_TO_AGBNO(mp
,
1998 int startidx
, endidx
;
2000 xfs_agblock_t agbno
;
2002 DECLARE_BITMAP(holemask
, XFS_INOBT_HOLEMASK_BITS
);
2004 if (!xfs_inobt_issparse(rec
->ir_holemask
)) {
2005 /* not sparse, calculate extent info directly */
2006 return xfs_free_extent_later(tp
, xfs_agbno_to_fsb(pag
, sagbno
),
2007 M_IGEO(mp
)->ialloc_blks
, &XFS_RMAP_OINFO_INODES
,
2008 XFS_AG_RESV_NONE
, 0);
2011 /* holemask is only 16-bits (fits in an unsigned long) */
2012 ASSERT(sizeof(rec
->ir_holemask
) <= sizeof(holemask
[0]));
2013 holemask
[0] = rec
->ir_holemask
;
2016 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
2017 * holemask and convert the start/end index of each range to an extent.
2018 * We start with the start and end index both pointing at the first 0 in
2021 startidx
= endidx
= find_first_zero_bit(holemask
,
2022 XFS_INOBT_HOLEMASK_BITS
);
2023 nextbit
= startidx
+ 1;
2024 while (startidx
< XFS_INOBT_HOLEMASK_BITS
) {
2027 nextbit
= find_next_zero_bit(holemask
, XFS_INOBT_HOLEMASK_BITS
,
2030 * If the next zero bit is contiguous, update the end index of
2031 * the current range and continue.
2033 if (nextbit
!= XFS_INOBT_HOLEMASK_BITS
&&
2034 nextbit
== endidx
+ 1) {
2040 * nextbit is not contiguous with the current end index. Convert
2041 * the current start/end to an extent and add it to the free
2044 agbno
= sagbno
+ (startidx
* XFS_INODES_PER_HOLEMASK_BIT
) /
2045 mp
->m_sb
.sb_inopblock
;
2046 contigblk
= ((endidx
- startidx
+ 1) *
2047 XFS_INODES_PER_HOLEMASK_BIT
) /
2048 mp
->m_sb
.sb_inopblock
;
2050 ASSERT(agbno
% mp
->m_sb
.sb_spino_align
== 0);
2051 ASSERT(contigblk
% mp
->m_sb
.sb_spino_align
== 0);
2052 error
= xfs_free_extent_later(tp
, xfs_agbno_to_fsb(pag
, agbno
),
2053 contigblk
, &XFS_RMAP_OINFO_INODES
,
2054 XFS_AG_RESV_NONE
, 0);
2058 /* reset range to current bit and carry on... */
2059 startidx
= endidx
= nextbit
;
2069 struct xfs_perag
*pag
,
2070 struct xfs_trans
*tp
,
2071 struct xfs_buf
*agbp
,
2073 struct xfs_icluster
*xic
,
2074 struct xfs_inobt_rec_incore
*orec
)
2076 struct xfs_mount
*mp
= pag_mount(pag
);
2077 struct xfs_agi
*agi
= agbp
->b_addr
;
2078 struct xfs_btree_cur
*cur
;
2079 struct xfs_inobt_rec_incore rec
;
2085 ASSERT(agi
->agi_magicnum
== cpu_to_be32(XFS_AGI_MAGIC
));
2086 ASSERT(XFS_AGINO_TO_AGBNO(mp
, agino
) < be32_to_cpu(agi
->agi_length
));
2089 * Initialize the cursor.
2091 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
2093 error
= xfs_check_agi_freecount(cur
);
2098 * Look for the entry describing this inode.
2100 if ((error
= xfs_inobt_lookup(cur
, agino
, XFS_LOOKUP_LE
, &i
))) {
2101 xfs_warn(mp
, "%s: xfs_inobt_lookup() returned error %d.",
2105 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2106 xfs_btree_mark_sick(cur
);
2107 error
= -EFSCORRUPTED
;
2110 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
2112 xfs_warn(mp
, "%s: xfs_inobt_get_rec() returned error %d.",
2116 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2117 xfs_btree_mark_sick(cur
);
2118 error
= -EFSCORRUPTED
;
2122 * Get the offset in the inode chunk.
2124 off
= agino
- rec
.ir_startino
;
2125 ASSERT(off
>= 0 && off
< XFS_INODES_PER_CHUNK
);
2126 ASSERT(!(rec
.ir_free
& XFS_INOBT_MASK(off
)));
2128 * Mark the inode free & increment the count.
2130 rec
.ir_free
|= XFS_INOBT_MASK(off
);
2134 * When an inode chunk is free, it becomes eligible for removal. Don't
2135 * remove the chunk if the block size is large enough for multiple inode
2136 * chunks (that might not be free).
2138 if (!xfs_has_ikeep(mp
) && rec
.ir_free
== XFS_INOBT_ALL_FREE
&&
2139 mp
->m_sb
.sb_inopblock
<= XFS_INODES_PER_CHUNK
) {
2140 xic
->deleted
= true;
2141 xic
->first_ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
);
2142 xic
->alloc
= xfs_inobt_irec_to_allocmask(&rec
);
2145 * Remove the inode cluster from the AGI B+Tree, adjust the
2146 * AGI and Superblock inode counts, and mark the disk space
2147 * to be freed when the transaction is committed.
2149 ilen
= rec
.ir_freecount
;
2150 be32_add_cpu(&agi
->agi_count
, -ilen
);
2151 be32_add_cpu(&agi
->agi_freecount
, -(ilen
- 1));
2152 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_COUNT
| XFS_AGI_FREECOUNT
);
2153 pag
->pagi_freecount
-= ilen
- 1;
2154 pag
->pagi_count
-= ilen
;
2155 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_ICOUNT
, -ilen
);
2156 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -(ilen
- 1));
2158 if ((error
= xfs_btree_delete(cur
, &i
))) {
2159 xfs_warn(mp
, "%s: xfs_btree_delete returned error %d.",
2164 error
= xfs_difree_inode_chunk(tp
, pag
, &rec
);
2168 xic
->deleted
= false;
2170 error
= xfs_inobt_update(cur
, &rec
);
2172 xfs_warn(mp
, "%s: xfs_inobt_update returned error %d.",
2178 * Change the inode free counts and log the ag/sb changes.
2180 be32_add_cpu(&agi
->agi_freecount
, 1);
2181 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
2182 pag
->pagi_freecount
++;
2183 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, 1);
2186 error
= xfs_check_agi_freecount(cur
);
2191 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
2195 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
2200 * Free an inode in the free inode btree.
2204 struct xfs_perag
*pag
,
2205 struct xfs_trans
*tp
,
2206 struct xfs_buf
*agbp
,
2208 struct xfs_inobt_rec_incore
*ibtrec
) /* inobt record */
2210 struct xfs_mount
*mp
= pag_mount(pag
);
2211 struct xfs_btree_cur
*cur
;
2212 struct xfs_inobt_rec_incore rec
;
2213 int offset
= agino
- ibtrec
->ir_startino
;
2217 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
2219 error
= xfs_inobt_lookup(cur
, ibtrec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
2224 * If the record does not exist in the finobt, we must have just
2225 * freed an inode in a previously fully allocated chunk. If not,
2226 * something is out of sync.
2228 if (XFS_IS_CORRUPT(mp
, ibtrec
->ir_freecount
!= 1)) {
2229 xfs_btree_mark_sick(cur
);
2230 error
= -EFSCORRUPTED
;
2234 error
= xfs_inobt_insert_rec(cur
, ibtrec
->ir_holemask
,
2236 ibtrec
->ir_freecount
,
2237 ibtrec
->ir_free
, &i
);
2246 * Read and update the existing record. We could just copy the ibtrec
2247 * across here, but that would defeat the purpose of having redundant
2248 * metadata. By making the modifications independently, we can catch
2249 * corruptions that we wouldn't see if we just copied from one record
2252 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
2255 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2256 xfs_btree_mark_sick(cur
);
2257 error
= -EFSCORRUPTED
;
2261 rec
.ir_free
|= XFS_INOBT_MASK(offset
);
2264 if (XFS_IS_CORRUPT(mp
,
2265 rec
.ir_free
!= ibtrec
->ir_free
||
2266 rec
.ir_freecount
!= ibtrec
->ir_freecount
)) {
2267 xfs_btree_mark_sick(cur
);
2268 error
= -EFSCORRUPTED
;
2273 * The content of inobt records should always match between the inobt
2274 * and finobt. The lifecycle of records in the finobt is different from
2275 * the inobt in that the finobt only tracks records with at least one
2276 * free inode. Hence, if all of the inodes are free and we aren't
2277 * keeping inode chunks permanently on disk, remove the record.
2278 * Otherwise, update the record with the new information.
2280 * Note that we currently can't free chunks when the block size is large
2281 * enough for multiple chunks. Leave the finobt record to remain in sync
2284 if (!xfs_has_ikeep(mp
) && rec
.ir_free
== XFS_INOBT_ALL_FREE
&&
2285 mp
->m_sb
.sb_inopblock
<= XFS_INODES_PER_CHUNK
) {
2286 error
= xfs_btree_delete(cur
, &i
);
2291 error
= xfs_inobt_update(cur
, &rec
);
2297 error
= xfs_check_agi_freecount(cur
);
2301 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
2305 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
2310 * Free disk inode. Carefully avoids touching the incore inode, all
2311 * manipulations incore are the caller's responsibility.
2312 * The on-disk inode is not changed by this operation, only the
2313 * btree (free inode mask) is changed.
2317 struct xfs_trans
*tp
,
2318 struct xfs_perag
*pag
,
2320 struct xfs_icluster
*xic
)
2323 xfs_agblock_t agbno
; /* block number containing inode */
2324 struct xfs_buf
*agbp
; /* buffer for allocation group header */
2325 xfs_agino_t agino
; /* allocation group inode number */
2326 int error
; /* error return value */
2327 struct xfs_mount
*mp
= tp
->t_mountp
;
2328 struct xfs_inobt_rec_incore rec
;/* btree record */
2331 * Break up inode number into its components.
2333 if (pag_agno(pag
) != XFS_INO_TO_AGNO(mp
, inode
)) {
2334 xfs_warn(mp
, "%s: agno != pag_agno(pag) (%d != %d).",
2335 __func__
, XFS_INO_TO_AGNO(mp
, inode
), pag_agno(pag
));
2339 agino
= XFS_INO_TO_AGINO(mp
, inode
);
2340 if (inode
!= xfs_agino_to_ino(pag
, agino
)) {
2341 xfs_warn(mp
, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
2342 __func__
, (unsigned long long)inode
,
2343 (unsigned long long)xfs_agino_to_ino(pag
, agino
));
2347 agbno
= XFS_AGINO_TO_AGBNO(mp
, agino
);
2348 if (agbno
>= xfs_ag_block_count(mp
, pag_agno(pag
))) {
2349 xfs_warn(mp
, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
2350 __func__
, agbno
, xfs_ag_block_count(mp
, pag_agno(pag
)));
2355 * Get the allocation group header.
2357 error
= xfs_ialloc_read_agi(pag
, tp
, 0, &agbp
);
2359 xfs_warn(mp
, "%s: xfs_ialloc_read_agi() returned error %d.",
2365 * Fix up the inode allocation btree.
2367 error
= xfs_difree_inobt(pag
, tp
, agbp
, agino
, xic
, &rec
);
2372 * Fix up the free inode btree.
2374 if (xfs_has_finobt(mp
)) {
2375 error
= xfs_difree_finobt(pag
, tp
, agbp
, agino
, &rec
);
STATIC int
xfs_imap_lookup(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_agino_t		agino,
	xfs_agblock_t		agbno,
	xfs_agblock_t		*chunk_agbno,
	xfs_agblock_t		*offset_agbno,
	int			flags)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;

	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_alert(mp,
			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
			__func__, error, pag_agno(pag));
		return error;
	}

	/*
	 * Lookup the inode record for the given agino. If the record cannot be
	 * found, then it's an invalid inode number and we should abort. Once
	 * we have a record, we need to ensure it contains the inode number
	 * we are looking up.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error) {
		if (i)
			error = xfs_inobt_get_rec(cur, &rec, &i);
		if (!error && i == 0)
			error = -EINVAL;
	}

	xfs_trans_brelse(tp, agbp);
	xfs_btree_del_cursor(cur, error);
	if (error)
		return error;

	/* check that the returned record contains the required inode */
	if (rec.ir_startino > agino ||
	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
		return -EINVAL;

	/* for untrusted inodes check it is allocated first */
	if ((flags & XFS_IGET_UNTRUSTED) &&
	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
		return -EINVAL;

	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
	*offset_agbno = agbno - *chunk_agbno;
	return 0;
}
/*
 * Return the location of the inode in imap, for mapping it into a buffer.
 */
int
xfs_imap(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,	/* inode to locate */
	struct xfs_imap		*imap,	/* location map structure */
	uint			flags)	/* flags for inode btree lookup */
{
	struct xfs_mount	*mp = pag_mount(pag);
	xfs_agblock_t		agbno;	/* block number of inode in the alloc group */
	xfs_agino_t		agino;	/* inode number within alloc group */
	xfs_agblock_t		chunk_agbno;	/* first block in inode chunk */
	xfs_agblock_t		cluster_agbno;	/* first block in inode cluster */
	int			error;	/* error code */
	int			offset;	/* index of inode in its buffer */
	xfs_agblock_t		offset_agbno;	/* blks from chunk start to inode */

	ASSERT(ino != NULLFSINO);

	/*
	 * Split up the inode number into its parts.
	 */
	agino = XFS_INO_TO_AGINO(mp, ino);
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
	    ino != xfs_agino_to_ino(pag, agino)) {
		error = -EINVAL;
#ifdef DEBUG
		/*
		 * Don't output diagnostic information for untrusted inodes
		 * as they can be invalid without implying corruption.
		 */
		if (flags & XFS_IGET_UNTRUSTED)
			return error;
		if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
			xfs_alert(mp,
		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
				__func__, (unsigned long long)agbno,
				(unsigned long)xfs_ag_block_count(mp,
						pag_agno(pag)));
		}
		if (ino != xfs_agino_to_ino(pag, agino)) {
			xfs_alert(mp,
		"%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
				__func__, ino,
				xfs_agino_to_ino(pag, agino));
		}
		xfs_stack_trace();
#endif /* DEBUG */
		return error;
	}

	/*
	 * For bulkstat and handle lookups, we have an untrusted inode number
	 * that we have to verify is valid. We cannot do this just by reading
	 * the inode buffer as it may have been unlinked and removed leaving
	 * inodes in stale state on disk. Hence we have to do a btree lookup
	 * in all cases where an untrusted inode number is passed.
	 */
	if (flags & XFS_IGET_UNTRUSTED) {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
		goto out_map;
	}

	/*
	 * If the inode cluster size is the same as the blocksize or
	 * smaller we get to the buffer by simple arithmetics.
	 */
	if (M_IGEO(mp)->blocks_per_cluster == 1) {
		offset = XFS_INO_TO_OFFSET(mp, ino);
		ASSERT(offset < mp->m_sb.sb_inopblock);

		imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
		imap->im_len = XFS_FSB_TO_BB(mp, 1);
		imap->im_boffset = (unsigned short)(offset <<
							mp->m_sb.sb_inodelog);
		return 0;
	}

	/*
	 * If the inode chunks are aligned then use simple maths to
	 * find the location. Otherwise we have to do a btree
	 * lookup to find the location.
	 */
	if (M_IGEO(mp)->inoalign_mask) {
		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
		chunk_agbno = agbno - offset_agbno;
	} else {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
	}

out_map:
	ASSERT(agbno >= chunk_agbno);
	cluster_agbno = chunk_agbno +
		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
		 M_IGEO(mp)->blocks_per_cluster);
	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
		XFS_INO_TO_OFFSET(mp, ino);

	imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
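
	/*
	 * Worked example (illustrative numbers only): with 4k blocks and
	 * 512-byte inodes, sb_inopblock = 8 and a 16k cluster spans
	 * blocks_per_cluster = 4 blocks. An inode 5 blocks past chunk_agbno
	 * lands in the second cluster (5 / 4 = 1), so cluster_agbno is
	 * chunk_agbno + 4 and offset = (5 - 4) * 8 + XFS_INO_TO_OFFSET().
	 */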

	/*
	 * If the inode number maps to a block outside the bounds
	 * of the file system then return NULL rather than calling
	 * read_buf and panicing when we get an error from the
	 * driver.
	 */
	if ((imap->im_blkno + imap->im_len) >
	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
		xfs_alert(mp,
	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
			__func__, (unsigned long long) imap->im_blkno,
			(unsigned long long) imap->im_len,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
		return -EINVAL;
	}
	return 0;
}
/*
 * Log specified fields for the ag hdr (inode section). The growth of the agi
 * structure over time requires that we interpret the buffer as two logical
 * regions delineated by the end of the unlinked list. This is due to the size
 * of the hash table and its location in the middle of the agi.
 *
 * For example, a request to log a field before agi_unlinked and a field after
 * agi_unlinked could cause us to log the entire hash table and use an excessive
 * amount of log space. To avoid this behavior, log the region up through
 * agi_unlinked in one call and the region after agi_unlinked through the end of
 * the structure in another.
 */
void
xfs_ialloc_log_agi(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	uint32_t		fields)
{
	int			first;		/* first byte number */
	int			last;		/* last byte number */
	static const short	offsets[] = {	/* field starting offsets */
					/* keep in sync with bit definitions */
		offsetof(xfs_agi_t, agi_magicnum),
		offsetof(xfs_agi_t, agi_versionnum),
		offsetof(xfs_agi_t, agi_seqno),
		offsetof(xfs_agi_t, agi_length),
		offsetof(xfs_agi_t, agi_count),
		offsetof(xfs_agi_t, agi_root),
		offsetof(xfs_agi_t, agi_level),
		offsetof(xfs_agi_t, agi_freecount),
		offsetof(xfs_agi_t, agi_newino),
		offsetof(xfs_agi_t, agi_dirino),
		offsetof(xfs_agi_t, agi_unlinked),
		offsetof(xfs_agi_t, agi_free_root),
		offsetof(xfs_agi_t, agi_free_level),
		offsetof(xfs_agi_t, agi_iblocks),
		sizeof(xfs_agi_t)
	};
#ifdef DEBUG
	struct xfs_agi		*agi = bp->b_addr;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif

	/*
	 * Compute byte offsets for the first and last fields in the first
	 * region and log the agi buffer. This only logs up through
	 * agi_unlinked.
	 */
	if (fields & XFS_AGI_ALL_BITS_R1) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}

	/*
	 * Mask off the bits in the first region and calculate the first and
	 * last field offsets for any bits in the second region.
	 */
	fields &= ~XFS_AGI_ALL_BITS_R1;
	if (fields) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}
}
static xfs_failaddr_t
xfs_agi_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;
	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
	int			i;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
			return __this_address;
	}

	/*
	 * Validate the magic number of the agi block.
	 */
	if (!xfs_verify_magic(bp, agi->agi_magicnum))
		return __this_address;
	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
		return __this_address;

	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
	if (fa)
		return fa;

	if (be32_to_cpu(agi->agi_level) < 1 ||
	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
		return __this_address;

	if (xfs_has_finobt(mp) &&
	    (be32_to_cpu(agi->agi_free_level) < 1 ||
	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
		return __this_address;

	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
			continue;
		if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
			return __this_address;
	}

	return NULL;
}
static void
xfs_agi_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_has_crc(mp) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agi_verify(bp);
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
static void
xfs_agi_write_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agi_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (!xfs_has_crc(mp))
		return;

	if (bip)
		agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
	.name = "xfs_agi",
	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
	.verify_read = xfs_agi_read_verify,
	.verify_write = xfs_agi_write_verify,
	.verify_struct = xfs_agi_verify,
};
/*
 * Read in the allocation group header (inode allocation section)
 */
int
xfs_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	int			error;

	trace_xfs_read_agi(pag);
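
	/*
	 * The AGI sits at a fixed disk address within each AG, just after
	 * the superblock and AGF sectors; XFS_AG_DADDR() combined with
	 * XFS_AGI_DADDR() computes that address for this AG.
	 */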
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
	if (error)
		return error;
	if (tp)
		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);

	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
	return 0;
}
/*
 * Read in the agi and initialise the per-ag data. If the caller supplies a
 * @agibpp, return the locked AGI buffer to them, otherwise release it.
 */
int
xfs_ialloc_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	int			error;

	trace_xfs_ialloc_read_agi(pag);

	error = xfs_read_agi(pag, tp,
			(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			&agibp);
	if (error)
		return error;

	agi = agibp->b_addr;
	if (!xfs_perag_initialised_agi(pag)) {
		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
		pag->pagi_count = be32_to_cpu(agi->agi_count);
		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	}

	/*
	 * It's possible for these to be out of sync if
	 * we are in the middle of a forced shutdown.
	 */
	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
		xfs_is_shutdown(pag_mount(pag)));
	if (agibpp)
		*agibpp = agibp;
	else
		xfs_trans_brelse(tp, agibp);
	return 0;
}
/* How many inodes are backed by inode clusters ondisk? */
STATIC int
xfs_ialloc_count_ondisk(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			low,
	xfs_agino_t			high,
	unsigned int			*allocated)
{
	struct xfs_inobt_rec_incore	irec;
	unsigned int			ret = 0;
	int				has_record;
	int				error;

	error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
	if (error)
		return error;

	while (has_record) {
		unsigned int		i, hole_idx;

		error = xfs_inobt_get_rec(cur, &irec, &has_record);
		if (error)
			return error;
		if (irec.ir_startino > high)
			break;

		for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
			if (irec.ir_startino + i < low)
				continue;
			if (irec.ir_startino + i > high)
				break;
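
			/*
			 * Each holemask bit covers XFS_INODES_PER_HOLEMASK_BIT
			 * inodes of the 64-inode chunk; a set bit marks that
			 * group as a hole, so only clear bits count as being
			 * backed by an inode cluster on disk.
			 */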
			hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
			if (!(irec.ir_holemask & (1U << hole_idx)))
				ret++;
		}

		error = xfs_btree_increment(cur, 0, &has_record);
		if (error)
			return error;
	}

	*allocated = ret;
	return 0;
}
/* Is there an inode record covering a given extent? */
int
xfs_ialloc_has_inodes_at_extent(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	xfs_agino_t		agino;
	xfs_agino_t		last_agino;
	unsigned int		allocated;
	int			error;

	agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
	last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;

	error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
	if (error)
		return error;

	if (allocated == 0)
		*outcome = XBTREE_RECPACKING_EMPTY;
	else if (allocated == last_agino - agino + 1)
		*outcome = XBTREE_RECPACKING_FULL;
	else
		*outcome = XBTREE_RECPACKING_SPARSE;
	return 0;
}
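
/*
 * Example of the classification above (illustrative geometry): a 4-block
 * extent at 8 inodes per block spans 32 inode numbers; if all 32 are backed
 * on disk the packing is FULL, if none are it is EMPTY, and anything in
 * between is SPARSE.
 */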

struct xfs_ialloc_count_inodes {
	xfs_agino_t			count;
	xfs_agino_t			freecount;
};

/* Record inode counts across all inobt records. */
STATIC int
xfs_ialloc_count_inodes_rec(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xfs_ialloc_count_inodes	*ci = priv;
	xfs_failaddr_t			fa;

	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, &irec);

	ci->count += irec.ir_count;
	ci->freecount += irec.ir_freecount;

	return 0;
}
/* Count allocated and free inodes under an inobt. */
int
xfs_ialloc_count_inodes(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			*count,
	xfs_agino_t			*freecount)
{
	struct xfs_ialloc_count_inodes	ci = {0};
	int				error;

	ASSERT(xfs_btree_is_ino(cur->bc_ops));
	error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
	if (error)
		return error;

	*count = ci.count;
	*freecount = ci.freecount;
	return 0;
}
/*
 * Initialize inode-related geometry information.
 *
 * Compute the inode btree min and max levels and set maxicount.
 *
 * Set the inode cluster size. This may still be overridden by the file
 * system block size if it is larger than the chosen cluster size.
 *
 * For v5 filesystems, scale the cluster size with the inode size to keep a
 * constant ratio of inode per cluster buffer, but only if mkfs has set the
 * inode alignment value appropriately for larger cluster sizes.
 *
 * Then compute the inode cluster alignment information.
 */
void
xfs_ialloc_setup_geometry(
	struct xfs_mount	*mp)
{
	struct xfs_sb		*sbp = &mp->m_sb;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	uint64_t		icount;
	uint			inodes;

	igeo->new_diflags2 = 0;
	if (xfs_has_bigtime(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
	if (xfs_has_large_extent_counts(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;

	/* Compute inode btree geometry. */
	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;

	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
			sbp->sb_inopblock);
	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;

	if (sbp->sb_spino_align)
		igeo->ialloc_min_blks = sbp->sb_spino_align;
	else
		igeo->ialloc_min_blks = igeo->ialloc_blks;

	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
			inodes);
	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());

	/*
	 * Set the maximum inode count for this filesystem, being careful not
	 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
	 * users should never get here due to failing sb verification, but
	 * certain users (xfs_db) need to be usable even with corrupt metadata.
	 */
	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
		/*
		 * Make sure the maximum inode count is a multiple
		 * of the units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, igeo->ialloc_blks);
		igeo->maxicount = XFS_FSB_TO_INO(mp,
				icount * igeo->ialloc_blks);
	} else {
		igeo->maxicount = 0;
	}
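
	/*
	 * Illustrative maxicount arithmetic (assumed values): 1,000,000 data
	 * blocks with sb_imax_pct = 25 gives 250,000 blocks, rounded down to
	 * a multiple of ialloc_blks; at 8 inodes per block XFS_FSB_TO_INO()
	 * then caps the filesystem at 2,000,000 inodes.
	 */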

	/*
	 * Compute the desired size of an inode cluster buffer size, which
	 * starts at 8K and (on v5 filesystems) scales up with larger inode
	 * sizes.
	 *
	 * Preserve the desired inode cluster size because the sparse inodes
	 * feature uses that desired size (not the actual size) to compute the
	 * sparse inode alignment. The mount code validates this value, so we
	 * cannot change the behavior.
	 */
	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
	if (xfs_has_v3inodes(mp)) {
		int	new_size = igeo->inode_cluster_size_raw;

		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
			igeo->inode_cluster_size_raw = new_size;
	}
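
	/*
	 * Example of the scaling above (illustrative, assuming a 256-byte
	 * minimum on-disk inode): with 1024-byte inodes the factor
	 * sb_inodesize / XFS_DINODE_MIN_SIZE is 4, so the 8k base cluster
	 * grows to 32k, but only if sb_inoalignmt already spans that much.
	 */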

	/* Calculate inode cluster ratios. */
	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
				igeo->inode_cluster_size_raw);
	else
		igeo->blocks_per_cluster = 1;
	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);

	/* Calculate inode cluster alignment. */
	if (xfs_has_align(mp) &&
	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
	else
		igeo->cluster_align = 1;
	igeo->inoalign_mask = igeo->cluster_align - 1;
	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);

	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && igeo->inoalign_mask &&
	    !(mp->m_dalign & igeo->inoalign_mask))
		igeo->ialloc_align = mp->m_dalign;
	else
		igeo->ialloc_align = 0;

	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
		igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
	else
		igeo->min_folio_order = 0;
}
/* Compute the location of the root directory inode that is laid out by mkfs. */
xfs_ino_t
xfs_ialloc_calc_rootino(
	struct xfs_mount	*mp,
	int			sunit)
{
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agblock_t		first_bno;

	/*
	 * Pre-calculate the geometry of AG 0. We know what it looks like
	 * because libxfs knows how to create allocation groups now.
	 *
	 * first_bno is the first block in which mkfs could possibly have
	 * allocated the root directory inode, once we factor in the metadata
	 * that mkfs formats before it. Namely, the four AG headers...
	 */
	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
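
	/*
	 * For example (illustrative sizes): with 512-byte sectors and 4k
	 * blocks the four AG headers fit in howmany(2048, 4096) = 1 block.
	 */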

	/* ...the two free space btree roots... */
	first_bno += 2;

	/* ...the inode btree root... */
	first_bno++;

	/* ...the initial AGFL... */
	first_bno += xfs_alloc_min_freelist(mp, NULL);

	/* ...the free inode btree root... */
	if (xfs_has_finobt(mp))
		first_bno++;

	/* ...the reverse mapping btree root... */
	if (xfs_has_rmapbt(mp))
		first_bno++;

	/* ...the reference count btree... */
	if (xfs_has_reflink(mp))
		first_bno++;

	/*
	 * ...and the log, if it is allocated in the first allocation group.
	 *
	 * This can happen with filesystems that only have a single
	 * allocation group, or very odd geometries created by old mkfs
	 * versions on very small filesystems.
	 */
	if (xfs_ag_contains_log(mp, 0))
		first_bno += mp->m_sb.sb_logblocks;

	/*
	 * Now round first_bno up to whatever allocation alignment is given
	 * by the filesystem or was passed in.
	 */
	if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
		first_bno = roundup(first_bno, sunit);
	else if (xfs_has_align(mp) &&
			mp->m_sb.sb_inoalignmt > 1)
		first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);

	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
}
/*
 * Ensure there are not sparse inode clusters that cross the new EOAG.
 *
 * This is a no-op for non-spinode filesystems since clusters are always fully
 * allocated and checking the bnobt suffices. However, a spinode filesystem
 * could have a record where the upper inodes are free blocks. If those blocks
 * were removed from the filesystem, the inode record would extend beyond EOAG,
 * which will be flagged as corruption.
 */
int
xfs_ialloc_check_shrink(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agibp,
	xfs_agblock_t		new_length)
{
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	xfs_agino_t		agino;
	int			has;
	int			error;

	if (!xfs_has_sparseinodes(pag_mount(pag)))
		return 0;

	cur = xfs_inobt_init_cursor(pag, tp, agibp);

	/* Look up the inobt record that would correspond to the new EOFS. */
	agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
	if (error || !has)
		goto out;

	error = xfs_inobt_get_rec(cur, &rec, &has);
	if (error)
		goto out;

	if (!has) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
		error = -EFSCORRUPTED;
		goto out;
	}

	/* If the record covers inodes that would be beyond EOFS, bail out. */
	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
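		/*
		 * e.g. if new_length maps to agino 1024 but this record
		 * starts at ir_startino 1000, its 64-inode chunk covers
		 * inodes 1000..1063 and would extend past the proposed end
		 * of the AG, so the shrink has to be rejected.
		 */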
		error = -ENOSPC;
		goto out;
	}
out:
	xfs_btree_del_cursor(cur, error);
	return error;
}