// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "libxfs_priv.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_trans.h"
#include "xfs_trace.h"
#include "xfs_health.h"
/*
 * Lookup a record by ino in the btree given by cur.
 */
int					/* error */
xfs_inobt_lookup(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agino_t		ino,	/* starting inode of chunk */
	xfs_lookup_t		dir,	/* <=, >=, == */
	int			*stat)	/* success/failure */
{
	cur->bc_rec.i.ir_startino = ino;
	cur->bc_rec.i.ir_holemask = 0;
	cur->bc_rec.i.ir_count = 0;
	cur->bc_rec.i.ir_freecount = 0;
	cur->bc_rec.i.ir_free = 0;
	return xfs_btree_lookup(cur, dir, stat);
}
/*
 * Update the record referred to by cur to the value given.
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_inobt_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_inobt_rec_incore_t	*irec)	/* btree record */
{
	union xfs_btree_rec	rec;

	rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
	if (xfs_has_sparseinodes(cur->bc_mp)) {
		rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
		rec.inobt.ir_u.sp.ir_count = irec->ir_count;
		rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
	} else {
		/* ir_holemask/ir_count not supported on-disk */
		rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
	}
	rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
	return xfs_btree_update(cur, &rec);
}
/* Convert on-disk btree record to incore inobt record. */
void
xfs_inobt_btrec_to_irec(
	struct xfs_mount		*mp,
	const union xfs_btree_rec	*rec,
	struct xfs_inobt_rec_incore	*irec)
{
	irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
	if (xfs_has_sparseinodes(mp)) {
		irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
		irec->ir_count = rec->inobt.ir_u.sp.ir_count;
		irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
	} else {
		/*
		 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
		 * values for full inode chunks.
		 */
		irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
		irec->ir_count = XFS_INODES_PER_CHUNK;
		irec->ir_freecount =
				be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
	}
	irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
/* Compute the freecount of an incore inode record. */
uint8_t
xfs_inobt_rec_freecount(
	const struct xfs_inobt_rec_incore	*irec)
{
	uint64_t		realfree = irec->ir_free;

	if (xfs_inobt_issparse(irec->ir_holemask))
		realfree &= xfs_inobt_irec_to_allocmask(irec);
	return hweight64(realfree);
}
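/*
 * Illustrative worked example (not part of the original source; the values
 * below are assumed to show the arithmetic): consider a sparse record with
 * ir_holemask = 0xff00, i.e. holemask bits 8-15 set, so inodes 32-63 of the
 * 64-inode chunk are holes.  xfs_inobt_irec_to_allocmask() then returns
 * 0x00000000ffffffff.  With ir_free = XFS_INOBT_ALL_FREE (all ones, hole
 * bits included), the masked value is 0x00000000ffffffff and hweight64()
 * reports a freecount of 32 - only physically allocated inodes are counted.
 */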
/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
	struct xfs_perag			*pag,
	const struct xfs_inobt_rec_incore	*irec)
{
	/* Record has to be properly aligned within the AG. */
	if (!xfs_verify_agino(pag, irec->ir_startino))
		return __this_address;
	if (!xfs_verify_agino(pag,
				irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
		return __this_address;
	if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
	    irec->ir_count > XFS_INODES_PER_CHUNK)
		return __this_address;
	if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
		return __this_address;

	if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
		return __this_address;

	return NULL;
}
STATIC int
xfs_inobt_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_inobt_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%sbt record corruption in AG %d detected at %pS!",
		cur->bc_ops->name, cur->bc_group->xg_gno, fa);
	xfs_warn(mp,
		"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
		irec->ir_startino, irec->ir_count, irec->ir_freecount,
		irec->ir_free, irec->ir_holemask);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}
/*
 * Get the data from the pointed-to record.
 */
int
xfs_inobt_get_rec(
	struct xfs_btree_cur		*cur,
	struct xfs_inobt_rec_incore	*irec,
	int				*stat)
{
	struct xfs_mount		*mp = cur->bc_mp;
	union xfs_btree_rec		*rec;
	xfs_failaddr_t			fa;
	int				error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || *stat == 0)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, irec);

	return 0;
}
/*
 * Insert a single inobt record. Cursor must already point to desired location.
 */
int
xfs_inobt_insert_rec(
	struct xfs_btree_cur	*cur,
	uint16_t		holemask,
	uint8_t			count,
	int32_t			freecount,
	xfs_inofree_t		free,
	int			*stat)
{
	cur->bc_rec.i.ir_holemask = holemask;
	cur->bc_rec.i.ir_count = count;
	cur->bc_rec.i.ir_freecount = freecount;
	cur->bc_rec.i.ir_free = free;
	return xfs_btree_insert(cur, stat);
}
/*
 * Insert records describing a newly allocated inode chunk into the inobt.
 */
STATIC int
xfs_inobt_insert(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	xfs_agino_t		newino,
	xfs_agino_t		newlen,
	bool			is_finobt)
{
	struct xfs_btree_cur	*cur;
	xfs_agino_t		thisino;
	int			i;
	int			error;

	if (is_finobt)
		cur = xfs_finobt_init_cursor(pag, tp, agbp);
	else
		cur = xfs_inobt_init_cursor(pag, tp, agbp);

	for (thisino = newino;
	     thisino < newino + newlen;
	     thisino += XFS_INODES_PER_CHUNK) {
		error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 0);

		error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
					     XFS_INODES_PER_CHUNK,
					     XFS_INODES_PER_CHUNK,
					     XFS_INOBT_ALL_FREE, &i);
		if (error) {
			xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
			return error;
		}
		ASSERT(i == 1);
	}

	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);

	return 0;
}
#ifdef DEBUG
/*
 * Verify that the number of free inodes in the AGI is correct.
 */
STATIC int
xfs_check_agi_freecount(
	struct xfs_btree_cur	*cur)
{
	if (cur->bc_nlevels == 1) {
		xfs_inobt_rec_incore_t rec;
		int		freecount = 0;
		int		error;
		int		i;

		error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
		if (error)
			return error;

		do {
			error = xfs_inobt_get_rec(cur, &rec, &i);
			if (error)
				return error;

			if (i) {
				freecount += rec.ir_freecount;
				error = xfs_btree_increment(cur, 0, &i);
				if (error)
					return error;
			}
		} while (i == 1);

		if (!xfs_is_shutdown(cur->bc_mp)) {
			ASSERT(freecount ==
				to_perag(cur->bc_group)->pagi_freecount);
		}
	}
	return 0;
}
#else
#define xfs_check_agi_freecount(cur)	0
#endif
/*
 * Initialise a new set of inodes. When called without a transaction context
 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 * than logging them (which in a transaction context puts them into the AIL
 * for writeback rather than the xfsbufd queue).
 */
int
xfs_ialloc_inode_init(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct list_head	*buffer_list,
	int			icount,
	xfs_agnumber_t		agno,
	xfs_agblock_t		agbno,
	xfs_agblock_t		length,
	unsigned int		gen)
{
	struct xfs_buf		*fbuf;
	struct xfs_dinode	*free;
	int			nbufs;
	int			version;
	int			i, j;
	xfs_daddr_t		d;
	xfs_ino_t		ino = 0;
	int			error;

	/*
	 * Loop over the new block(s), filling in the inodes. For small block
	 * sizes, manipulate the inodes in buffers which are multiples of the
	 * block size.
	 */
	nbufs = length / M_IGEO(mp)->blocks_per_cluster;

	/*
	 * Figure out what version number to use in the inodes we create. If
	 * the superblock version has caught up to the one that supports the new
	 * inode format, then use the new inode version. Otherwise use the old
	 * version so that old kernels will continue to be able to use the file
	 * system.
	 *
	 * For v3 inodes, we also need to write the inode number into the inode,
	 * so calculate the first inode number of the chunk here as
	 * XFS_AGB_TO_AGINO() only works within a filesystem block, not
	 * across multiple filesystem blocks (such as a cluster) and so cannot
	 * be used in the cluster buffer loop below.
	 *
	 * Further, because we are writing the inode directly into the buffer
	 * and calculating a CRC on the entire inode, we have to log the entire
	 * inode so that the entire range the CRC covers is present in the log.
	 * That means for v3 inodes we log the entire buffer rather than just
	 * the inode cores.
	 */
	if (xfs_has_v3inodes(mp)) {
		version = 3;
		ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));

		/*
		 * log the initialisation that is about to take place as a
		 * logical operation. This means the transaction does not
		 * need to log the physical changes to the inode buffers as log
		 * recovery will know what initialisation is actually needed.
		 * Hence we only need to log the buffers as "ordered" buffers so
		 * they track in the AIL as if they were physically logged.
		 */
		if (tp)
			xfs_icreate_log(tp, agno, agbno, icount,
					mp->m_sb.sb_inodesize, length, gen);
	} else
		version = 2;

	for (j = 0; j < nbufs; j++) {
		d = XFS_AGB_TO_DADDR(mp, agno, agbno +
				(j * M_IGEO(mp)->blocks_per_cluster));
		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
				mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
				0, &fbuf);
		if (error)
			return error;

		/* Initialize the inode buffers and log them appropriately. */
		fbuf->b_ops = &xfs_inode_buf_ops;
		xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
		for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
			int	ioffset = i << mp->m_sb.sb_inodelog;

			free = xfs_make_iptr(mp, fbuf, i);
			free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
			free->di_version = version;
			free->di_gen = cpu_to_be32(gen);
			free->di_next_unlinked = cpu_to_be32(NULLAGINO);

			if (version == 3) {
				free->di_ino = cpu_to_be64(ino);
				ino++;
				uuid_copy(&free->di_uuid,
					  &mp->m_sb.sb_meta_uuid);
				xfs_dinode_calc_crc(mp, free);
			} else if (tp) {
				/* just log the inode core */
				xfs_trans_log_buf(tp, fbuf, ioffset,
					  ioffset + XFS_DINODE_SIZE(mp) - 1);
			}
		}

		if (tp) {
			/*
			 * Mark the buffer as an inode allocation buffer so it
			 * sticks in AIL at the point of this allocation
			 * transaction. This ensures they are on disk before
			 * the tail of the log can be moved past this
			 * transaction (i.e. by preventing relogging from moving
			 * it forward in the log).
			 */
			xfs_trans_inode_alloc_buf(tp, fbuf);
			if (version == 3) {
				/*
				 * Mark the buffer as ordered so that they are
				 * not physically logged in the transaction but
				 * still tracked in the AIL as part of the
				 * transaction and pin the log appropriately.
				 */
				xfs_trans_ordered_buf(tp, fbuf);
			}
		} else {
			fbuf->b_flags |= XBF_DONE;
			xfs_buf_delwri_queue(fbuf, buffer_list);
			xfs_buf_relse(fbuf);
		}
	}
	return 0;
}
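/*
 * Illustrative geometry (assumed typical values, not taken from the original
 * source): with 4096-byte blocks, 512-byte inodes and a 16 KiB inode cluster,
 * blocks_per_cluster = 4 and inodes_per_cluster = 32.  A full 64-inode chunk
 * spans length = 8 blocks, so the loop above initialises nbufs = 8 / 4 = 2
 * cluster buffers of 32 inodes each.
 */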
/*
 * Align startino and allocmask for a recently allocated sparse chunk such that
 * they are fit for insertion (or merge) into the on-disk inode btrees.
 *
 * When enabled, sparse inode support increases the inode alignment from cluster
 * size to inode chunk size. This means that the minimum range between two
 * non-adjacent inode records in the inobt is large enough for a full inode
 * record. This allows for cluster sized, cluster aligned block allocation
 * without need to worry about whether the resulting inode record overlaps with
 * another record in the tree. Without this basic rule, we would have to deal
 * with the consequences of overlap by potentially undoing recent allocations in
 * the inode allocation codepath.
 *
 * Because of this alignment rule (which is enforced on mount), there are two
 * inobt possibilities for newly allocated sparse chunks. One is that the
 * aligned inode record for the chunk covers a range of inodes not already
 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 * other is that a record already exists at the aligned startino that considers
 * the newly allocated range as sparse. In the latter case, record content is
 * merged in hope that sparse inode chunks fill to full chunks over time.
 */
STATIC void
xfs_align_sparse_ino(
	struct xfs_mount		*mp,
	xfs_agino_t			*startino,
	uint16_t			*allocmask)
{
	xfs_agblock_t			agbno;
	xfs_agblock_t			mod;
	int				offset;

	agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
	mod = agbno % mp->m_sb.sb_inoalignmt;
	if (!mod)
		return;

	/* calculate the inode offset and align startino */
	offset = XFS_AGB_TO_AGINO(mp, mod);
	*startino -= offset;

	/*
	 * Since startino has been aligned down, left shift allocmask such that
	 * it continues to represent the same physical inodes relative to the
	 * new startino.
	 */
	*allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
}
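/*
 * Worked example (illustrative only, with assumed geometry): on a filesystem
 * with 8 inodes per block and sb_inoalignmt = 8 blocks (one full chunk), a
 * 4-block sparse allocation landing at an agbno with agbno % 8 == 4 gives
 * mod = 4 and offset = XFS_AGB_TO_AGINO(mp, 4) = 32 inodes.  *startino is
 * moved back by 32 and the allocmask 0x00ff is shifted left by 32 / 4 = 8
 * holemask positions to 0xff00, so the newly allocated inodes still map to
 * the upper half of the chunk relative to the aligned startino.
 */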
/*
 * Determine whether the source inode record can merge into the target. Both
 * records must be sparse, the inode ranges must match and there must be no
 * allocation overlap between the records.
 */
STATIC bool
__xfs_inobt_can_merge(
	struct xfs_inobt_rec_incore	*trec,	/* tgt record */
	struct xfs_inobt_rec_incore	*srec)	/* src record */
{
	uint64_t			talloc;
	uint64_t			salloc;

	/* records must cover the same inode range */
	if (trec->ir_startino != srec->ir_startino)
		return false;

	/* both records must be sparse */
	if (!xfs_inobt_issparse(trec->ir_holemask) ||
	    !xfs_inobt_issparse(srec->ir_holemask))
		return false;

	/* both records must track some inodes */
	if (!trec->ir_count || !srec->ir_count)
		return false;

	/* can't exceed capacity of a full record */
	if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
		return false;

	/* verify there is no allocation overlap */
	talloc = xfs_inobt_irec_to_allocmask(trec);
	salloc = xfs_inobt_irec_to_allocmask(srec);
	if (talloc & salloc)
		return false;

	return true;
}
/*
 * Merge the source inode record into the target. The caller must call
 * __xfs_inobt_can_merge() to ensure the merge is valid.
 */
STATIC void
__xfs_inobt_rec_merge(
	struct xfs_inobt_rec_incore	*trec,	/* target */
	struct xfs_inobt_rec_incore	*srec)	/* src */
{
	ASSERT(trec->ir_startino == srec->ir_startino);

	/* combine the counts */
	trec->ir_count += srec->ir_count;
	trec->ir_freecount += srec->ir_freecount;

	/*
	 * Merge the holemask and free mask. For both fields, 0 bits refer to
	 * allocated inodes. We combine the allocated ranges with bitwise AND.
	 */
	trec->ir_holemask &= srec->ir_holemask;
	trec->ir_free &= srec->ir_free;
}
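/*
 * Worked example (illustrative only, values assumed): merging an existing
 * target record with ir_holemask = 0xff00, ir_count = 32, ir_freecount = 10
 * (inodes 0-31 allocated, ten of them free) with a new source record with
 * ir_holemask = 0x00ff, ir_count = 32, ir_freecount = 32 (inodes 32-63 just
 * allocated, all free) yields ir_holemask = 0x0000, ir_count = 64 and
 * ir_freecount = 42 - a fully populated chunk.  ANDing the free masks is
 * correct because hole regions carry 1 bits in ir_free.
 */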
/*
 * Insert a new sparse inode chunk into the associated inode allocation btree.
 * The inode record for the sparse chunk is pre-aligned to a startino that
 * should match any pre-existing sparse inode record in the tree. This allows
 * sparse chunks to fill over time.
 *
 * If no preexisting record exists, the provided record is inserted.
 * If there is a preexisting record, the provided record is merged with the
 * existing record and updated in place. The merged record is returned in nrec.
 *
 * It is considered corruption if a merge is requested and not possible. Given
 * the sparse inode alignment constraints, this should never happen.
 */
STATIC int
xfs_inobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new/merged rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;
	struct xfs_inobt_rec_incore	rec;

	cur = xfs_inobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	/*
	 * A record exists at this startino. Merge the records.
	 */
	error = xfs_inobt_get_rec(cur, &rec, &i);
	if (error)
		goto error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}
	if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	/*
	 * This should never fail. If we have coexisting records that
	 * cannot merge, something is seriously wrong.
	 */
	if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
		xfs_btree_mark_sick(cur);
		error = -EFSCORRUPTED;
		goto error;
	}

	trace_xfs_irec_merge_pre(pag, &rec, nrec);

	/* merge to nrec to output the updated record */
	__xfs_inobt_rec_merge(nrec, &rec);

	trace_xfs_irec_merge_post(pag, nrec);

	error = xfs_inobt_rec_check_count(mp, nrec);
	if (error)
		goto error;

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
/*
 * Insert a new sparse inode chunk into the free inode btree. The inode
 * record for the sparse chunk is pre-aligned to a startino that should match
 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 * to fill over time.
 *
 * The new record is always inserted, overwriting a pre-existing record if
 * there is one.
 */
STATIC int
xfs_finobt_insert_sprec(
	struct xfs_perag		*pag,
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new rec. */
{
	struct xfs_mount		*mp = pag_mount(pag);
	struct xfs_btree_cur		*cur;
	int				error;
	int				i;

	cur = xfs_finobt_init_cursor(pag, tp, agbp);

	/* the new record is pre-aligned so we know where to look */
	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
	if (error)
		goto error;
	/* if nothing there, insert a new record and return */
	if (i == 0) {
		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
					     nrec->ir_count, nrec->ir_freecount,
					     nrec->ir_free, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error;
		}

		goto out;
	}

	error = xfs_inobt_update(cur, nrec);
	if (error)
		goto error;

out:
	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
	return 0;
error:
	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
	return error;
}
683 * Allocate new inodes in the allocation group specified by agbp. Returns 0 if
684 * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
685 * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
686 * inode count threshold, or the usual negative error code for other errors.
690 struct xfs_perag
*pag
,
691 struct xfs_trans
*tp
,
692 struct xfs_buf
*agbp
)
695 struct xfs_alloc_arg args
;
697 xfs_agino_t newino
; /* new first inode's number */
698 xfs_agino_t newlen
; /* new number of inodes */
699 int isaligned
= 0; /* inode allocation at stripe */
701 /* init. to full chunk */
702 struct xfs_inobt_rec_incore rec
;
703 struct xfs_ino_geometry
*igeo
= M_IGEO(tp
->t_mountp
);
704 uint16_t allocmask
= (uint16_t) -1;
707 memset(&args
, 0, sizeof(args
));
709 args
.mp
= tp
->t_mountp
;
710 args
.fsbno
= NULLFSBLOCK
;
711 args
.oinfo
= XFS_RMAP_OINFO_INODES
;
715 /* randomly do sparse inode allocations */
716 if (xfs_has_sparseinodes(tp
->t_mountp
) &&
717 igeo
->ialloc_min_blks
< igeo
->ialloc_blks
)
718 do_sparse
= get_random_u32_below(2);
722 * Locking will ensure that we don't have two callers in here
725 newlen
= igeo
->ialloc_inos
;
726 if (igeo
->maxicount
&&
727 percpu_counter_read_positive(&args
.mp
->m_icount
) + newlen
>
730 args
.minlen
= args
.maxlen
= igeo
->ialloc_blks
;
732 * First try to allocate inodes contiguous with the last-allocated
733 * chunk of inodes. If the filesystem is striped, this will fill
734 * an entire stripe unit with inodes.
737 newino
= be32_to_cpu(agi
->agi_newino
);
738 args
.agbno
= XFS_AGINO_TO_AGBNO(args
.mp
, newino
) +
742 if (likely(newino
!= NULLAGINO
&&
743 (args
.agbno
< be32_to_cpu(agi
->agi_length
)))) {
747 * We need to take into account alignment here to ensure that
748 * we don't modify the free list if we fail to have an exact
749 * block. If we don't have an exact match, and every other
750 * allocation attempt fails, we'll end up cancelling
751 * a dirty transaction and shutting down.
753 * For an exact allocation, alignment must be 1,
754 * however we need to take cluster alignment into account when
755 * fixing up the freelist. Use the minalignslop field to
756 * indicate that extra blocks might be required for alignment,
757 * but not to use them in the actual exact allocation.
760 args
.minalignslop
= igeo
->cluster_align
- 1;
762 /* Allow space for the inode btree to split. */
763 args
.minleft
= igeo
->inobt_maxlevels
;
764 error
= xfs_alloc_vextent_exact_bno(&args
,
765 xfs_agbno_to_fsb(pag
, args
.agbno
));
770 * This request might have dirtied the transaction if the AG can
771 * satisfy the request, but the exact block was not available.
772 * If the allocation did fail, subsequent requests will relax
773 * the exact agbno requirement and increase the alignment
774 * instead. It is critical that the total size of the request
775 * (len + alignment + slop) does not increase from this point
776 * on, so reset minalignslop to ensure it is not included in
777 * subsequent requests.
779 args
.minalignslop
= 0;
782 if (unlikely(args
.fsbno
== NULLFSBLOCK
)) {
784 * Set the alignment for the allocation.
785 * If stripe alignment is turned on then align at stripe unit
787 * If the cluster size is smaller than a filesystem block
788 * then we're doing I/O for inodes in filesystem block size
789 * pieces, so don't need alignment anyway.
792 if (igeo
->ialloc_align
) {
793 ASSERT(!xfs_has_noalign(args
.mp
));
794 args
.alignment
= args
.mp
->m_dalign
;
797 args
.alignment
= igeo
->cluster_align
;
799 * Allocate a fixed-size extent of inodes.
803 * Allow space for the inode btree to split.
805 args
.minleft
= igeo
->inobt_maxlevels
;
806 error
= xfs_alloc_vextent_near_bno(&args
,
807 xfs_agbno_to_fsb(pag
,
808 be32_to_cpu(agi
->agi_root
)));
814 * If stripe alignment is turned on, then try again with cluster
817 if (isaligned
&& args
.fsbno
== NULLFSBLOCK
) {
818 args
.alignment
= igeo
->cluster_align
;
819 error
= xfs_alloc_vextent_near_bno(&args
,
820 xfs_agbno_to_fsb(pag
,
821 be32_to_cpu(agi
->agi_root
)));
827 * Finally, try a sparse allocation if the filesystem supports it and
828 * the sparse allocation length is smaller than a full chunk.
830 if (xfs_has_sparseinodes(args
.mp
) &&
831 igeo
->ialloc_min_blks
< igeo
->ialloc_blks
&&
832 args
.fsbno
== NULLFSBLOCK
) {
834 args
.alignment
= args
.mp
->m_sb
.sb_spino_align
;
837 args
.minlen
= igeo
->ialloc_min_blks
;
838 args
.maxlen
= args
.minlen
;
841 * The inode record will be aligned to full chunk size. We must
842 * prevent sparse allocation from AG boundaries that result in
843 * invalid inode records, such as records that start at agbno 0
844 * or extend beyond the AG.
846 * Set min agbno to the first aligned, non-zero agbno and max to
847 * the last aligned agbno that is at least one full chunk from
850 args
.min_agbno
= args
.mp
->m_sb
.sb_inoalignmt
;
851 args
.max_agbno
= round_down(xfs_ag_block_count(args
.mp
,
853 args
.mp
->m_sb
.sb_inoalignmt
) -
856 error
= xfs_alloc_vextent_near_bno(&args
,
857 xfs_agbno_to_fsb(pag
,
858 be32_to_cpu(agi
->agi_root
)));
862 newlen
= XFS_AGB_TO_AGINO(args
.mp
, args
.len
);
863 ASSERT(newlen
<= XFS_INODES_PER_CHUNK
);
864 allocmask
= (1 << (newlen
/ XFS_INODES_PER_HOLEMASK_BIT
)) - 1;
867 if (args
.fsbno
== NULLFSBLOCK
)
870 ASSERT(args
.len
== args
.minlen
);
873 * Stamp and write the inode buffers.
875 * Seed the new inode cluster with a random generation number. This
876 * prevents short-term reuse of generation numbers if a chunk is
877 * freed and then immediately reallocated. We use random numbers
878 * rather than a linear progression to prevent the next generation
879 * number from being easily guessable.
881 error
= xfs_ialloc_inode_init(args
.mp
, tp
, NULL
, newlen
, pag_agno(pag
),
882 args
.agbno
, args
.len
, get_random_u32());
887 * Convert the results.
889 newino
= XFS_AGB_TO_AGINO(args
.mp
, args
.agbno
);
891 if (xfs_inobt_issparse(~allocmask
)) {
893 * We've allocated a sparse chunk. Align the startino and mask.
895 xfs_align_sparse_ino(args
.mp
, &newino
, &allocmask
);
897 rec
.ir_startino
= newino
;
898 rec
.ir_holemask
= ~allocmask
;
899 rec
.ir_count
= newlen
;
900 rec
.ir_freecount
= newlen
;
901 rec
.ir_free
= XFS_INOBT_ALL_FREE
;
904 * Insert the sparse record into the inobt and allow for a merge
905 * if necessary. If a merge does occur, rec is updated to the
908 error
= xfs_inobt_insert_sprec(pag
, tp
, agbp
, &rec
);
909 if (error
== -EFSCORRUPTED
) {
911 "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
912 xfs_agino_to_ino(pag
, rec
.ir_startino
),
913 rec
.ir_holemask
, rec
.ir_count
);
914 xfs_force_shutdown(args
.mp
, SHUTDOWN_CORRUPT_INCORE
);
920 * We can't merge the part we've just allocated as for the inobt
921 * due to finobt semantics. The original record may or may not
922 * exist independent of whether physical inodes exist in this
925 * We must update the finobt record based on the inobt record.
926 * rec contains the fully merged and up to date inobt record
927 * from the previous call. Set merge false to replace any
928 * existing record with this one.
930 if (xfs_has_finobt(args
.mp
)) {
931 error
= xfs_finobt_insert_sprec(pag
, tp
, agbp
, &rec
);
936 /* full chunk - insert new records to both btrees */
937 error
= xfs_inobt_insert(pag
, tp
, agbp
, newino
, newlen
, false);
941 if (xfs_has_finobt(args
.mp
)) {
942 error
= xfs_inobt_insert(pag
, tp
, agbp
, newino
,
950 * Update AGI counts and newino.
952 be32_add_cpu(&agi
->agi_count
, newlen
);
953 be32_add_cpu(&agi
->agi_freecount
, newlen
);
954 pag
->pagi_freecount
+= newlen
;
955 pag
->pagi_count
+= newlen
;
956 agi
->agi_newino
= cpu_to_be32(newino
);
959 * Log allocation group header fields
961 xfs_ialloc_log_agi(tp
, agbp
,
962 XFS_AGI_COUNT
| XFS_AGI_FREECOUNT
| XFS_AGI_NEWINO
);
964 * Modify/log superblock values for inode count and inode free count.
966 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_ICOUNT
, (long)newlen
);
967 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, (long)newlen
);
972 * Try to retrieve the next record to the left/right from the current one.
976 struct xfs_btree_cur
*cur
,
977 xfs_inobt_rec_incore_t
*rec
,
985 error
= xfs_btree_decrement(cur
, 0, &i
);
987 error
= xfs_btree_increment(cur
, 0, &i
);
993 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
996 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
997 xfs_btree_mark_sick(cur
);
998 return -EFSCORRUPTED
;
1007 struct xfs_btree_cur
*cur
,
1009 xfs_inobt_rec_incore_t
*rec
,
1015 error
= xfs_inobt_lookup(cur
, agino
, XFS_LOOKUP_EQ
, &i
);
1020 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1023 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1024 xfs_btree_mark_sick(cur
);
1025 return -EFSCORRUPTED
;
1033 * Return the offset of the first free inode in the record. If the inode chunk
1034 * is sparsely allocated, we convert the record holemask to inode granularity
1035 * and mask off the unallocated regions from the inode free mask.
1038 xfs_inobt_first_free_inode(
1039 struct xfs_inobt_rec_incore
*rec
)
1041 xfs_inofree_t realfree
;
1043 /* if there are no holes, return the first available offset */
1044 if (!xfs_inobt_issparse(rec
->ir_holemask
))
1045 return xfs_lowbit64(rec
->ir_free
);
1047 realfree
= xfs_inobt_irec_to_allocmask(rec
);
1048 realfree
&= rec
->ir_free
;
1050 return xfs_lowbit64(realfree
);
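/*
 * Worked example (illustrative only, values assumed): for a sparse record
 * with ir_holemask = 0x00ff (inodes 0-31 are holes), the allocmask is
 * 0xffffffff00000000.  If inodes 32-35 are in use, ir_free is
 * 0xfffffff0ffffffff (hole bits stay set), realfree masks down to
 * 0xfffffff000000000 and xfs_lowbit64() returns offset 36 - the first
 * genuinely free, physically allocated inode in the chunk.
 */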
1054 * If this AG has corrupt inodes, check if allocating this inode would fail
1055 * with corruption errors. Returns 0 if we're clear, or EAGAIN to try again
1059 xfs_dialloc_check_ino(
1060 struct xfs_perag
*pag
,
1061 struct xfs_trans
*tp
,
1064 struct xfs_imap imap
;
1068 error
= xfs_imap(pag
, tp
, ino
, &imap
, 0);
1072 error
= xfs_imap_to_bp(pag_mount(pag
), tp
, &imap
, &bp
);
1076 xfs_trans_brelse(tp
, bp
);
1081 * Allocate an inode using the inobt-only algorithm.
1084 xfs_dialloc_ag_inobt(
1085 struct xfs_perag
*pag
,
1086 struct xfs_trans
*tp
,
1087 struct xfs_buf
*agbp
,
1091 struct xfs_mount
*mp
= tp
->t_mountp
;
1092 struct xfs_agi
*agi
= agbp
->b_addr
;
1093 xfs_agnumber_t pagno
= XFS_INO_TO_AGNO(mp
, parent
);
1094 xfs_agino_t pagino
= XFS_INO_TO_AGINO(mp
, parent
);
1095 struct xfs_btree_cur
*cur
, *tcur
;
1096 struct xfs_inobt_rec_incore rec
, trec
;
1101 int searchdistance
= 10;
1103 ASSERT(xfs_perag_initialised_agi(pag
));
1104 ASSERT(xfs_perag_allows_inodes(pag
));
1105 ASSERT(pag
->pagi_freecount
> 0);
1108 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
1110 * If pagino is 0 (this is the root inode allocation) use newino.
1111 * This must work because we've just allocated some.
1114 pagino
= be32_to_cpu(agi
->agi_newino
);
1116 error
= xfs_check_agi_freecount(cur
);
1121 * If in the same AG as the parent, try to get near the parent.
1123 if (pagno
== pag_agno(pag
)) {
1124 int doneleft
; /* done, to the left */
1125 int doneright
; /* done, to the right */
1127 error
= xfs_inobt_lookup(cur
, pagino
, XFS_LOOKUP_LE
, &i
);
1130 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1131 xfs_btree_mark_sick(cur
);
1132 error
= -EFSCORRUPTED
;
1136 error
= xfs_inobt_get_rec(cur
, &rec
, &j
);
1139 if (XFS_IS_CORRUPT(mp
, j
!= 1)) {
1140 xfs_btree_mark_sick(cur
);
1141 error
= -EFSCORRUPTED
;
1145 if (rec
.ir_freecount
> 0) {
1147 * Found a free inode in the same chunk
1148 * as the parent, done.
1155 * In the same AG as parent, but parent's chunk is full.
1158 /* duplicate the cursor, search left & right simultaneously */
1159 error
= xfs_btree_dup_cursor(cur
, &tcur
);
1164 * Skip to last blocks looked up if same parent inode.
1166 if (pagino
!= NULLAGINO
&&
1167 pag
->pagl_pagino
== pagino
&&
1168 pag
->pagl_leftrec
!= NULLAGINO
&&
1169 pag
->pagl_rightrec
!= NULLAGINO
) {
1170 error
= xfs_ialloc_get_rec(tcur
, pag
->pagl_leftrec
,
1175 error
= xfs_ialloc_get_rec(cur
, pag
->pagl_rightrec
,
1180 /* search left with tcur, back up 1 record */
1181 error
= xfs_ialloc_next_rec(tcur
, &trec
, &doneleft
, 1);
1185 /* search right with cur, go forward 1 record. */
1186 error
= xfs_ialloc_next_rec(cur
, &rec
, &doneright
, 0);
1192 * Loop until we find an inode chunk with a free inode.
1194 while (--searchdistance
> 0 && (!doneleft
|| !doneright
)) {
1195 int useleft
; /* using left inode chunk this time */
1197 /* figure out the closer block if both are valid. */
1198 if (!doneleft
&& !doneright
) {
1200 (trec
.ir_startino
+ XFS_INODES_PER_CHUNK
- 1) <
1201 rec
.ir_startino
- pagino
;
1203 useleft
= !doneleft
;
1206 /* free inodes to the left? */
1207 if (useleft
&& trec
.ir_freecount
) {
1208 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1211 pag
->pagl_leftrec
= trec
.ir_startino
;
1212 pag
->pagl_rightrec
= rec
.ir_startino
;
1213 pag
->pagl_pagino
= pagino
;
1218 /* free inodes to the right? */
1219 if (!useleft
&& rec
.ir_freecount
) {
1220 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1222 pag
->pagl_leftrec
= trec
.ir_startino
;
1223 pag
->pagl_rightrec
= rec
.ir_startino
;
1224 pag
->pagl_pagino
= pagino
;
1228 /* get next record to check */
1230 error
= xfs_ialloc_next_rec(tcur
, &trec
,
1233 error
= xfs_ialloc_next_rec(cur
, &rec
,
1240 if (searchdistance
<= 0) {
1242 * Not in range - save last search
1243 * location and allocate a new inode
1245 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1246 pag
->pagl_leftrec
= trec
.ir_startino
;
1247 pag
->pagl_rightrec
= rec
.ir_startino
;
1248 pag
->pagl_pagino
= pagino
;
1252 * We've reached the end of the btree. Because
1253 * we are only searching a small chunk of the
1254 * btree each search, there are obviously free
1255 * inodes closer to the parent inode than we
1256 * are now. Restart the search again.
1258 pag
->pagl_pagino
= NULLAGINO
;
1259 pag
->pagl_leftrec
= NULLAGINO
;
1260 pag
->pagl_rightrec
= NULLAGINO
;
1261 xfs_btree_del_cursor(tcur
, XFS_BTREE_NOERROR
);
1262 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1268 * In a different AG from the parent.
1269 * See if the most recently allocated block has any free.
1271 if (agi
->agi_newino
!= cpu_to_be32(NULLAGINO
)) {
1272 error
= xfs_inobt_lookup(cur
, be32_to_cpu(agi
->agi_newino
),
1278 error
= xfs_inobt_get_rec(cur
, &rec
, &j
);
1282 if (j
== 1 && rec
.ir_freecount
> 0) {
1284 * The last chunk allocated in the group
1285 * still has a free inode.
1293 * None left in the last group, search the whole AG
1295 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
1298 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1299 xfs_btree_mark_sick(cur
);
1300 error
= -EFSCORRUPTED
;
1305 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
1308 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1309 xfs_btree_mark_sick(cur
);
1310 error
= -EFSCORRUPTED
;
1313 if (rec
.ir_freecount
> 0)
1315 error
= xfs_btree_increment(cur
, 0, &i
);
1318 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
1319 xfs_btree_mark_sick(cur
);
1320 error
= -EFSCORRUPTED
;
1326 offset
= xfs_inobt_first_free_inode(&rec
);
1327 ASSERT(offset
>= 0);
1328 ASSERT(offset
< XFS_INODES_PER_CHUNK
);
1329 ASSERT((XFS_AGINO_TO_OFFSET(mp
, rec
.ir_startino
) %
1330 XFS_INODES_PER_CHUNK
) == 0);
1331 ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
+ offset
);
1333 if (xfs_ag_has_sickness(pag
, XFS_SICK_AG_INODES
)) {
1334 error
= xfs_dialloc_check_ino(pag
, tp
, ino
);
1339 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1341 error
= xfs_inobt_update(cur
, &rec
);
1344 be32_add_cpu(&agi
->agi_freecount
, -1);
1345 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
1346 pag
->pagi_freecount
--;
1348 error
= xfs_check_agi_freecount(cur
);
1352 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1353 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -1);
1357 xfs_btree_del_cursor(tcur
, XFS_BTREE_ERROR
);
1359 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
1364 * Use the free inode btree to allocate an inode based on distance from the
1365 * parent. Note that the provided cursor may be deleted and replaced.
1368 xfs_dialloc_ag_finobt_near(
1370 struct xfs_btree_cur
**ocur
,
1371 struct xfs_inobt_rec_incore
*rec
)
1373 struct xfs_btree_cur
*lcur
= *ocur
; /* left search cursor */
1374 struct xfs_btree_cur
*rcur
; /* right search cursor */
1375 struct xfs_inobt_rec_incore rrec
;
1379 error
= xfs_inobt_lookup(lcur
, pagino
, XFS_LOOKUP_LE
, &i
);
1384 error
= xfs_inobt_get_rec(lcur
, rec
, &i
);
1387 if (XFS_IS_CORRUPT(lcur
->bc_mp
, i
!= 1)) {
1388 xfs_btree_mark_sick(lcur
);
1389 return -EFSCORRUPTED
;
1393 * See if we've landed in the parent inode record. The finobt
1394 * only tracks chunks with at least one free inode, so record
1395 * existence is enough.
1397 if (pagino
>= rec
->ir_startino
&&
1398 pagino
< (rec
->ir_startino
+ XFS_INODES_PER_CHUNK
))
1402 error
= xfs_btree_dup_cursor(lcur
, &rcur
);
1406 error
= xfs_inobt_lookup(rcur
, pagino
, XFS_LOOKUP_GE
, &j
);
1410 error
= xfs_inobt_get_rec(rcur
, &rrec
, &j
);
1413 if (XFS_IS_CORRUPT(lcur
->bc_mp
, j
!= 1)) {
1414 xfs_btree_mark_sick(lcur
);
1415 error
= -EFSCORRUPTED
;
1420 if (XFS_IS_CORRUPT(lcur
->bc_mp
, i
!= 1 && j
!= 1)) {
1421 xfs_btree_mark_sick(lcur
);
1422 error
= -EFSCORRUPTED
;
1425 if (i
== 1 && j
== 1) {
1427 * Both the left and right records are valid. Choose the closer
1428 * inode chunk to the target.
1430 if ((pagino
- rec
->ir_startino
+ XFS_INODES_PER_CHUNK
- 1) >
1431 (rrec
.ir_startino
- pagino
)) {
1433 xfs_btree_del_cursor(lcur
, XFS_BTREE_NOERROR
);
1436 xfs_btree_del_cursor(rcur
, XFS_BTREE_NOERROR
);
1438 } else if (j
== 1) {
1439 /* only the right record is valid */
1441 xfs_btree_del_cursor(lcur
, XFS_BTREE_NOERROR
);
1443 } else if (i
== 1) {
1444 /* only the left record is valid */
1445 xfs_btree_del_cursor(rcur
, XFS_BTREE_NOERROR
);
1451 xfs_btree_del_cursor(rcur
, XFS_BTREE_ERROR
);
1456 * Use the free inode btree to find a free inode based on a newino hint. If
1457 * the hint is NULL, find the first free inode in the AG.
1460 xfs_dialloc_ag_finobt_newino(
1461 struct xfs_agi
*agi
,
1462 struct xfs_btree_cur
*cur
,
1463 struct xfs_inobt_rec_incore
*rec
)
1468 if (agi
->agi_newino
!= cpu_to_be32(NULLAGINO
)) {
1469 error
= xfs_inobt_lookup(cur
, be32_to_cpu(agi
->agi_newino
),
1474 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1477 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1478 xfs_btree_mark_sick(cur
);
1479 return -EFSCORRUPTED
;
1486 * Find the first inode available in the AG.
1488 error
= xfs_inobt_lookup(cur
, 0, XFS_LOOKUP_GE
, &i
);
1491 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1492 xfs_btree_mark_sick(cur
);
1493 return -EFSCORRUPTED
;
1496 error
= xfs_inobt_get_rec(cur
, rec
, &i
);
1499 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1500 xfs_btree_mark_sick(cur
);
1501 return -EFSCORRUPTED
;
1508 * Update the inobt based on a modification made to the finobt. Also ensure that
1509 * the records from both trees are equivalent post-modification.
1512 xfs_dialloc_ag_update_inobt(
1513 struct xfs_btree_cur
*cur
, /* inobt cursor */
1514 struct xfs_inobt_rec_incore
*frec
, /* finobt record */
1515 int offset
) /* inode offset */
1517 struct xfs_inobt_rec_incore rec
;
1521 error
= xfs_inobt_lookup(cur
, frec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
1524 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1525 xfs_btree_mark_sick(cur
);
1526 return -EFSCORRUPTED
;
1529 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
1532 if (XFS_IS_CORRUPT(cur
->bc_mp
, i
!= 1)) {
1533 xfs_btree_mark_sick(cur
);
1534 return -EFSCORRUPTED
;
1536 ASSERT((XFS_AGINO_TO_OFFSET(cur
->bc_mp
, rec
.ir_startino
) %
1537 XFS_INODES_PER_CHUNK
) == 0);
1539 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1542 if (XFS_IS_CORRUPT(cur
->bc_mp
,
1543 rec
.ir_free
!= frec
->ir_free
||
1544 rec
.ir_freecount
!= frec
->ir_freecount
)) {
1545 xfs_btree_mark_sick(cur
);
1546 return -EFSCORRUPTED
;
1549 return xfs_inobt_update(cur
, &rec
);
1553 * Allocate an inode using the free inode btree, if available. Otherwise, fall
1554 * back to the inobt search algorithm.
1556 * The caller selected an AG for us, and made sure that free inodes are
1561 struct xfs_perag
*pag
,
1562 struct xfs_trans
*tp
,
1563 struct xfs_buf
*agbp
,
1567 struct xfs_mount
*mp
= tp
->t_mountp
;
1568 struct xfs_agi
*agi
= agbp
->b_addr
;
1569 xfs_agnumber_t pagno
= XFS_INO_TO_AGNO(mp
, parent
);
1570 xfs_agino_t pagino
= XFS_INO_TO_AGINO(mp
, parent
);
1571 struct xfs_btree_cur
*cur
; /* finobt cursor */
1572 struct xfs_btree_cur
*icur
; /* inobt cursor */
1573 struct xfs_inobt_rec_incore rec
;
1579 if (!xfs_has_finobt(mp
))
1580 return xfs_dialloc_ag_inobt(pag
, tp
, agbp
, parent
, inop
);
1583 * If pagino is 0 (this is the root inode allocation) use newino.
1584 * This must work because we've just allocated some.
1587 pagino
= be32_to_cpu(agi
->agi_newino
);
1589 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
1591 error
= xfs_check_agi_freecount(cur
);
1596 * The search algorithm depends on whether we're in the same AG as the
1597 * parent. If so, find the closest available inode to the parent. If
1598 * not, consider the agi hint or find the first free inode in the AG.
1600 if (pag_agno(pag
) == pagno
)
1601 error
= xfs_dialloc_ag_finobt_near(pagino
, &cur
, &rec
);
1603 error
= xfs_dialloc_ag_finobt_newino(agi
, cur
, &rec
);
1607 offset
= xfs_inobt_first_free_inode(&rec
);
1608 ASSERT(offset
>= 0);
1609 ASSERT(offset
< XFS_INODES_PER_CHUNK
);
1610 ASSERT((XFS_AGINO_TO_OFFSET(mp
, rec
.ir_startino
) %
1611 XFS_INODES_PER_CHUNK
) == 0);
1612 ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
+ offset
);
1614 if (xfs_ag_has_sickness(pag
, XFS_SICK_AG_INODES
)) {
1615 error
= xfs_dialloc_check_ino(pag
, tp
, ino
);
1621 * Modify or remove the finobt record.
1623 rec
.ir_free
&= ~XFS_INOBT_MASK(offset
);
1625 if (rec
.ir_freecount
)
1626 error
= xfs_inobt_update(cur
, &rec
);
1628 error
= xfs_btree_delete(cur
, &i
);
1633 * The finobt has now been updated appropriately. We haven't updated the
1634 * agi and superblock yet, so we can create an inobt cursor and validate
1635 * the original freecount. If all is well, make the equivalent update to
1636 * the inobt using the finobt record and offset information.
1638 icur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
1640 error
= xfs_check_agi_freecount(icur
);
1644 error
= xfs_dialloc_ag_update_inobt(icur
, &rec
, offset
);
1649 * Both trees have now been updated. We must update the perag and
1650 * superblock before we can check the freecount for each btree.
1652 be32_add_cpu(&agi
->agi_freecount
, -1);
1653 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
1654 pag
->pagi_freecount
--;
1656 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -1);
1658 error
= xfs_check_agi_freecount(icur
);
1661 error
= xfs_check_agi_freecount(cur
);
1665 xfs_btree_del_cursor(icur
, XFS_BTREE_NOERROR
);
1666 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
1671 xfs_btree_del_cursor(icur
, XFS_BTREE_ERROR
);
1673 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
1679 struct xfs_trans
**tpp
,
1680 struct xfs_buf
*agibp
)
1682 struct xfs_trans
*tp
= *tpp
;
1683 struct xfs_dquot_acct
*dqinfo
;
1687 * Hold on to the agibp across the commit so no other allocation can
1688 * come in and take the free inodes we just allocated for our caller.
1690 xfs_trans_bhold(tp
, agibp
);
1693 * We want the quota changes to be associated with the next transaction,
1694 * NOT this one. So, detach the dqinfo from this and attach it to the
1697 dqinfo
= tp
->t_dqinfo
;
1698 tp
->t_dqinfo
= NULL
;
1700 error
= xfs_trans_roll(&tp
);
1702 /* Re-attach the quota info that we detached from prev trx. */
1703 tp
->t_dqinfo
= dqinfo
;
1706 * Join the buffer even on commit error so that the buffer is released
1707 * when the caller cancels the transaction and doesn't have to handle
1708 * this error case specially.
1710 xfs_trans_bjoin(tp
, agibp
);
1716 xfs_dialloc_good_ag(
1717 struct xfs_perag
*pag
,
1718 struct xfs_trans
*tp
,
1723 struct xfs_mount
*mp
= tp
->t_mountp
;
1725 xfs_extlen_t longest
= 0;
1731 if (!xfs_perag_allows_inodes(pag
))
1734 if (!xfs_perag_initialised_agi(pag
)) {
1735 error
= xfs_ialloc_read_agi(pag
, tp
, 0, NULL
);
1740 if (pag
->pagi_freecount
)
1745 if (!xfs_perag_initialised_agf(pag
)) {
1746 error
= xfs_alloc_read_agf(pag
, tp
, flags
, NULL
);
1752 * Check that there is enough free space for the file plus a chunk of
1753 * inodes if we need to allocate some. If this is the first pass across
1754 * the AGs, take into account the potential space needed for alignment
1755 * of inode chunks when checking the longest contiguous free space in
1756 * the AG - this prevents us from getting ENOSPC because we have free
1757 * space larger than ialloc_blks but alignment constraints prevent us
1760 * If we can't find an AG with space for full alignment slack to be
1761 * taken into account, we must be near ENOSPC in all AGs. Hence we
1762 * don't include alignment for the second pass and so if we fail
1763 * allocation due to alignment issues then it is most likely a real
1766 * XXX(dgc): this calculation is now bogus thanks to the per-ag
1767 * reservations that xfs_alloc_fix_freelist() now does via
1768 * xfs_alloc_space_available(). When the AG fills up, pagf_freeblks will
1769 * be more than large enough for the check below to succeed, but
1770 * xfs_alloc_space_available() will fail because of the non-zero
1771 * metadata reservation and hence we won't actually be able to allocate
1772 * more inodes in this AG. We do soooo much unnecessary work near ENOSPC
1775 ineed
= M_IGEO(mp
)->ialloc_min_blks
;
1776 if (flags
&& ineed
> 1)
1777 ineed
+= M_IGEO(mp
)->cluster_align
;
1778 longest
= pag
->pagf_longest
;
1780 longest
= pag
->pagf_flcount
> 0;
1781 needspace
= S_ISDIR(mode
) || S_ISREG(mode
) || S_ISLNK(mode
);
1783 if (pag
->pagf_freeblks
< needspace
+ ineed
|| longest
< ineed
)
1790 struct xfs_perag
*pag
,
1791 struct xfs_trans
**tpp
,
1796 struct xfs_buf
*agbp
;
1801 * Then read in the AGI buffer and recheck with the AGI buffer
1804 error
= xfs_ialloc_read_agi(pag
, *tpp
, 0, &agbp
);
1808 if (!pag
->pagi_freecount
) {
1814 error
= xfs_ialloc_ag_alloc(pag
, *tpp
, agbp
);
1819 * We successfully allocated space for an inode cluster in this
1820 * AG. Roll the transaction so that we can allocate one of the
1823 ASSERT(pag
->pagi_freecount
> 0);
1824 error
= xfs_dialloc_roll(tpp
, agbp
);
1829 /* Allocate an inode in the found AG */
1830 error
= xfs_dialloc_ag(pag
, *tpp
, agbp
, parent
, &ino
);
1836 xfs_trans_brelse(*tpp
, agbp
);
1841 * Pick an AG for the new inode.
1843 * Directories, symlinks, and regular files frequently allocate at least one
1844 * block, so factor that potential expansion when we examine whether an AG has
1845 * enough space for file creation. Try to keep metadata files all in the same
1848 static inline xfs_agnumber_t
1849 xfs_dialloc_pick_ag(
1850 struct xfs_mount
*mp
,
1851 struct xfs_inode
*dp
,
1854 xfs_agnumber_t start_agno
;
1858 if (xfs_is_metadir_inode(dp
)) {
1859 if (mp
->m_sb
.sb_logstart
)
1860 return XFS_FSB_TO_AGNO(mp
, mp
->m_sb
.sb_logstart
);
1865 return (atomic_inc_return(&mp
->m_agirotor
) - 1) % mp
->m_maxagi
;
1867 start_agno
= XFS_INO_TO_AGNO(mp
, dp
->i_ino
);
1868 if (start_agno
>= mp
->m_maxagi
)
1875 * Allocate an on-disk inode.
1877 * Mode is used to tell whether the new inode is a directory and hence where to
1878 * locate it. The on-disk inode that is allocated will be returned in @new_ino
1879 * on success, otherwise an error will be set to indicate the failure (e.g.
1884 struct xfs_trans
**tpp
,
1885 const struct xfs_icreate_args
*args
,
1888 struct xfs_mount
*mp
= (*tpp
)->t_mountp
;
1889 struct xfs_perag
*pag
;
1890 struct xfs_ino_geometry
*igeo
= M_IGEO(mp
);
1891 xfs_ino_t ino
= NULLFSINO
;
1892 xfs_ino_t parent
= args
->pip
? args
->pip
->i_ino
: 0;
1893 xfs_agnumber_t agno
;
1894 xfs_agnumber_t start_agno
;
1895 umode_t mode
= args
->mode
& S_IFMT
;
1896 bool ok_alloc
= true;
1897 bool low_space
= false;
1901 start_agno
= xfs_dialloc_pick_ag(mp
, args
->pip
, mode
);
1904 * If we have already hit the ceiling of inode blocks then clear
1905 * ok_alloc so we scan all available agi structures for a free
1908 * Read rough value of mp->m_icount by percpu_counter_read_positive,
1909 * which sacrifices precision but improves performance.
1911 if (igeo
->maxicount
&&
1912 percpu_counter_read_positive(&mp
->m_icount
) + igeo
->ialloc_inos
1913 > igeo
->maxicount
) {
1918 * If we are near to ENOSPC, we want to prefer allocation from AGs that
1919 * have free inodes in them rather than use up free space allocating new
1920 * inode chunks. Hence we turn off allocation for the first non-blocking
1921 * pass through the AGs if we are near ENOSPC to consume free inodes
1922 * that we can immediately allocate, but then we allow allocation on the
1923 * second pass if we fail to find an AG with free inodes in it.
1925 if (xfs_estimate_freecounter(mp
, XC_FREE_BLOCKS
) <
1926 mp
->m_low_space
[XFS_LOWSP_1_PCNT
]) {
1932 * Loop until we find an allocation group that either has free inodes
1933 * or in which we can allocate some inodes. Iterate through the
1934 * allocation groups upward, wrapping at the end.
1936 flags
= XFS_ALLOC_FLAG_TRYLOCK
;
1938 for_each_perag_wrap_at(mp
, start_agno
, mp
->m_maxagi
, agno
, pag
) {
1939 if (xfs_dialloc_good_ag(pag
, *tpp
, mode
, flags
, ok_alloc
)) {
1940 error
= xfs_dialloc_try_ag(pag
, tpp
, parent
,
1942 if (error
!= -EAGAIN
)
1947 if (xfs_is_shutdown(mp
)) {
1948 error
= -EFSCORRUPTED
;
1953 xfs_perag_rele(pag
);
1956 if (ino
== NULLFSINO
) {
1967 * Protect against obviously corrupt allocation btree records. Later
1968 * xfs_iget checks will catch re-allocation of other active in-memory
1969 * and on-disk inodes. If we don't catch reallocating the parent inode
1970 * here we will deadlock in xfs_iget() so we have to do these checks
1973 if (ino
== parent
|| !xfs_verify_dir_ino(mp
, ino
)) {
1974 xfs_alert(mp
, "Allocated a known in-use inode 0x%llx!", ino
);
1975 xfs_agno_mark_sick(mp
, XFS_INO_TO_AGNO(mp
, ino
),
1977 return -EFSCORRUPTED
;
1985 * Free the blocks of an inode chunk. We must consider that the inode chunk
1986 * might be sparse and only free the regions that are allocated as part of the
1990 xfs_difree_inode_chunk(
1991 struct xfs_trans
*tp
,
1992 struct xfs_perag
*pag
,
1993 struct xfs_inobt_rec_incore
*rec
)
1995 struct xfs_mount
*mp
= tp
->t_mountp
;
1996 xfs_agblock_t sagbno
= XFS_AGINO_TO_AGBNO(mp
,
1998 int startidx
, endidx
;
2000 xfs_agblock_t agbno
;
2002 DECLARE_BITMAP(holemask
, XFS_INOBT_HOLEMASK_BITS
);
2004 if (!xfs_inobt_issparse(rec
->ir_holemask
)) {
2005 /* not sparse, calculate extent info directly */
2006 return xfs_free_extent_later(tp
, xfs_agbno_to_fsb(pag
, sagbno
),
2007 M_IGEO(mp
)->ialloc_blks
, &XFS_RMAP_OINFO_INODES
,
2008 XFS_AG_RESV_NONE
, 0);
2011 /* holemask is only 16-bits (fits in an unsigned long) */
2012 ASSERT(sizeof(rec
->ir_holemask
) <= sizeof(holemask
[0]));
2013 holemask
[0] = rec
->ir_holemask
;
2016 * Find contiguous ranges of zeroes (i.e., allocated regions) in the
2017 * holemask and convert the start/end index of each range to an extent.
2018 * We start with the start and end index both pointing at the first 0 in
2021 startidx
= endidx
= find_first_zero_bit(holemask
,
2022 XFS_INOBT_HOLEMASK_BITS
);
2023 nextbit
= startidx
+ 1;
2024 while (startidx
< XFS_INOBT_HOLEMASK_BITS
) {
2027 nextbit
= find_next_zero_bit(holemask
, XFS_INOBT_HOLEMASK_BITS
,
2030 * If the next zero bit is contiguous, update the end index of
2031 * the current range and continue.
2033 if (nextbit
!= XFS_INOBT_HOLEMASK_BITS
&&
2034 nextbit
== endidx
+ 1) {
2040 * nextbit is not contiguous with the current end index. Convert
2041 * the current start/end to an extent and add it to the free
2044 agbno
= sagbno
+ (startidx
* XFS_INODES_PER_HOLEMASK_BIT
) /
2045 mp
->m_sb
.sb_inopblock
;
2046 contigblk
= ((endidx
- startidx
+ 1) *
2047 XFS_INODES_PER_HOLEMASK_BIT
) /
2048 mp
->m_sb
.sb_inopblock
;
2050 ASSERT(agbno
% mp
->m_sb
.sb_spino_align
== 0);
2051 ASSERT(contigblk
% mp
->m_sb
.sb_spino_align
== 0);
2052 error
= xfs_free_extent_later(tp
, xfs_agbno_to_fsb(pag
, agbno
),
2053 contigblk
, &XFS_RMAP_OINFO_INODES
,
2054 XFS_AG_RESV_NONE
, 0);
2058 /* reset range to current bit and carry on... */
2059 startidx
= endidx
= nextbit
;
2069 struct xfs_perag
*pag
,
2070 struct xfs_trans
*tp
,
2071 struct xfs_buf
*agbp
,
2073 struct xfs_icluster
*xic
,
2074 struct xfs_inobt_rec_incore
*orec
)
2076 struct xfs_mount
*mp
= pag_mount(pag
);
2077 struct xfs_agi
*agi
= agbp
->b_addr
;
2078 struct xfs_btree_cur
*cur
;
2079 struct xfs_inobt_rec_incore rec
;
2085 ASSERT(agi
->agi_magicnum
== cpu_to_be32(XFS_AGI_MAGIC
));
2086 ASSERT(XFS_AGINO_TO_AGBNO(mp
, agino
) < be32_to_cpu(agi
->agi_length
));
2089 * Initialize the cursor.
2091 cur
= xfs_inobt_init_cursor(pag
, tp
, agbp
);
2093 error
= xfs_check_agi_freecount(cur
);
2098 * Look for the entry describing this inode.
2100 if ((error
= xfs_inobt_lookup(cur
, agino
, XFS_LOOKUP_LE
, &i
))) {
2101 xfs_warn(mp
, "%s: xfs_inobt_lookup() returned error %d.",
2105 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2106 xfs_btree_mark_sick(cur
);
2107 error
= -EFSCORRUPTED
;
2110 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
2112 xfs_warn(mp
, "%s: xfs_inobt_get_rec() returned error %d.",
2116 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2117 xfs_btree_mark_sick(cur
);
2118 error
= -EFSCORRUPTED
;
2122 * Get the offset in the inode chunk.
2124 off
= agino
- rec
.ir_startino
;
2125 ASSERT(off
>= 0 && off
< XFS_INODES_PER_CHUNK
);
2126 ASSERT(!(rec
.ir_free
& XFS_INOBT_MASK(off
)));
2128 * Mark the inode free & increment the count.
2130 rec
.ir_free
|= XFS_INOBT_MASK(off
);
2134 * When an inode chunk is free, it becomes eligible for removal. Don't
2135 * remove the chunk if the block size is large enough for multiple inode
2136 * chunks (that might not be free).
2138 if (!xfs_has_ikeep(mp
) && rec
.ir_free
== XFS_INOBT_ALL_FREE
&&
2139 mp
->m_sb
.sb_inopblock
<= XFS_INODES_PER_CHUNK
) {
2140 xic
->deleted
= true;
2141 xic
->first_ino
= xfs_agino_to_ino(pag
, rec
.ir_startino
);
2142 xic
->alloc
= xfs_inobt_irec_to_allocmask(&rec
);
2145 * Remove the inode cluster from the AGI B+Tree, adjust the
2146 * AGI and Superblock inode counts, and mark the disk space
2147 * to be freed when the transaction is committed.
2149 ilen
= rec
.ir_freecount
;
2150 be32_add_cpu(&agi
->agi_count
, -ilen
);
2151 be32_add_cpu(&agi
->agi_freecount
, -(ilen
- 1));
2152 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_COUNT
| XFS_AGI_FREECOUNT
);
2153 pag
->pagi_freecount
-= ilen
- 1;
2154 pag
->pagi_count
-= ilen
;
2155 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_ICOUNT
, -ilen
);
2156 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, -(ilen
- 1));
2158 if ((error
= xfs_btree_delete(cur
, &i
))) {
2159 xfs_warn(mp
, "%s: xfs_btree_delete returned error %d.",
2164 error
= xfs_difree_inode_chunk(tp
, pag
, &rec
);
2168 xic
->deleted
= false;
2170 error
= xfs_inobt_update(cur
, &rec
);
2172 xfs_warn(mp
, "%s: xfs_inobt_update returned error %d.",
2178 * Change the inode free counts and log the ag/sb changes.
2180 be32_add_cpu(&agi
->agi_freecount
, 1);
2181 xfs_ialloc_log_agi(tp
, agbp
, XFS_AGI_FREECOUNT
);
2182 pag
->pagi_freecount
++;
2183 xfs_trans_mod_sb(tp
, XFS_TRANS_SB_IFREE
, 1);
2186 error
= xfs_check_agi_freecount(cur
);
2191 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
2195 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
2200 * Free an inode in the free inode btree.
2204 struct xfs_perag
*pag
,
2205 struct xfs_trans
*tp
,
2206 struct xfs_buf
*agbp
,
2208 struct xfs_inobt_rec_incore
*ibtrec
) /* inobt record */
2210 struct xfs_mount
*mp
= pag_mount(pag
);
2211 struct xfs_btree_cur
*cur
;
2212 struct xfs_inobt_rec_incore rec
;
2213 int offset
= agino
- ibtrec
->ir_startino
;
2217 cur
= xfs_finobt_init_cursor(pag
, tp
, agbp
);
2219 error
= xfs_inobt_lookup(cur
, ibtrec
->ir_startino
, XFS_LOOKUP_EQ
, &i
);
2224 * If the record does not exist in the finobt, we must have just
2225 * freed an inode in a previously fully allocated chunk. If not,
2226 * something is out of sync.
2228 if (XFS_IS_CORRUPT(mp
, ibtrec
->ir_freecount
!= 1)) {
2229 xfs_btree_mark_sick(cur
);
2230 error
= -EFSCORRUPTED
;
2234 error
= xfs_inobt_insert_rec(cur
, ibtrec
->ir_holemask
,
2236 ibtrec
->ir_freecount
,
2237 ibtrec
->ir_free
, &i
);
2246 * Read and update the existing record. We could just copy the ibtrec
2247 * across here, but that would defeat the purpose of having redundant
2248 * metadata. By making the modifications independently, we can catch
2249 * corruptions that we wouldn't see if we just copied from one record
2252 error
= xfs_inobt_get_rec(cur
, &rec
, &i
);
2255 if (XFS_IS_CORRUPT(mp
, i
!= 1)) {
2256 xfs_btree_mark_sick(cur
);
2257 error
= -EFSCORRUPTED
;
2261 rec
.ir_free
|= XFS_INOBT_MASK(offset
);
2264 if (XFS_IS_CORRUPT(mp
,
2265 rec
.ir_free
!= ibtrec
->ir_free
||
2266 rec
.ir_freecount
!= ibtrec
->ir_freecount
)) {
2267 xfs_btree_mark_sick(cur
);
2268 error
= -EFSCORRUPTED
;
2273 * The content of inobt records should always match between the inobt
2274 * and finobt. The lifecycle of records in the finobt is different from
2275 * the inobt in that the finobt only tracks records with at least one
2276 * free inode. Hence, if all of the inodes are free and we aren't
2277 * keeping inode chunks permanently on disk, remove the record.
2278 * Otherwise, update the record with the new information.
2280 * Note that we currently can't free chunks when the block size is large
2281 * enough for multiple chunks. Leave the finobt record to remain in sync
2284 if (!xfs_has_ikeep(mp
) && rec
.ir_free
== XFS_INOBT_ALL_FREE
&&
2285 mp
->m_sb
.sb_inopblock
<= XFS_INODES_PER_CHUNK
) {
2286 error
= xfs_btree_delete(cur
, &i
);
2291 error
= xfs_inobt_update(cur
, &rec
);
2297 error
= xfs_check_agi_freecount(cur
);
2301 xfs_btree_del_cursor(cur
, XFS_BTREE_NOERROR
);
2305 xfs_btree_del_cursor(cur
, XFS_BTREE_ERROR
);
2310 * Free disk inode. Carefully avoids touching the incore inode, all
2311 * manipulations incore are the caller's responsibility.
2312 * The on-disk inode is not changed by this operation, only the
2313 * btree (free inode mask) is changed.
2317 struct xfs_trans
*tp
,
2318 struct xfs_perag
*pag
,
2320 struct xfs_icluster
*xic
)
2323 xfs_agblock_t agbno
; /* block number containing inode */
2324 struct xfs_buf
*agbp
; /* buffer for allocation group header */
2325 xfs_agino_t agino
; /* allocation group inode number */
2326 int error
; /* error return value */
2327 struct xfs_mount
*mp
= tp
->t_mountp
;
2328 struct xfs_inobt_rec_incore rec
;/* btree record */
2331 * Break up inode number into its components.
2333 if (pag_agno(pag
) != XFS_INO_TO_AGNO(mp
, inode
)) {
2334 xfs_warn(mp
, "%s: agno != pag_agno(pag) (%d != %d).",
2335 __func__
, XFS_INO_TO_AGNO(mp
, inode
), pag_agno(pag
));
2339 agino
= XFS_INO_TO_AGINO(mp
, inode
);
2340 if (inode
!= xfs_agino_to_ino(pag
, agino
)) {
2341 xfs_warn(mp
, "%s: inode != xfs_agino_to_ino() (%llu != %llu).",
2342 __func__
, (unsigned long long)inode
,
2343 (unsigned long long)xfs_agino_to_ino(pag
, agino
));
2347 agbno
= XFS_AGINO_TO_AGBNO(mp
, agino
);
2348 if (agbno
>= xfs_ag_block_count(mp
, pag_agno(pag
))) {
2349 xfs_warn(mp
, "%s: agbno >= xfs_ag_block_count (%d >= %d).",
2350 __func__
, agbno
, xfs_ag_block_count(mp
, pag_agno(pag
)));
2355 * Get the allocation group header.
2357 error
= xfs_ialloc_read_agi(pag
, tp
, 0, &agbp
);
2359 xfs_warn(mp
, "%s: xfs_ialloc_read_agi() returned error %d.",
2365 * Fix up the inode allocation btree.
2367 error
= xfs_difree_inobt(pag
, tp
, agbp
, agino
, xic
, &rec
);
2372 * Fix up the free inode btree.
2374 if (xfs_has_finobt(mp
)) {
2375 error
= xfs_difree_finobt(pag
, tp
, agbp
, agino
, &rec
);
STATIC int
xfs_imap_lookup(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_agino_t		agino,
	xfs_agblock_t		agbno,
	xfs_agblock_t		*chunk_agbno,
	xfs_agblock_t		*offset_agbno,
	int			flags)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;

	error = xfs_ialloc_read_agi(pag, tp, 0, &agbp);
	if (error) {
		xfs_alert(mp,
			"%s: xfs_ialloc_read_agi() returned error %d, agno %d",
			__func__, error, pag_agno(pag));
		return error;
	}

	/*
	 * Lookup the inode record for the given agino. If the record cannot be
	 * found, then it's an invalid inode number and we should abort. Once
	 * we have a record, we need to ensure it contains the inode number
	 * we are looking up.
	 */
	cur = xfs_inobt_init_cursor(pag, tp, agbp);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
	if (!error) {
		if (i)
			error = xfs_inobt_get_rec(cur, &rec, &i);
		if (!error && i == 0)
			error = -EINVAL;
	}

	xfs_trans_brelse(tp, agbp);
	xfs_btree_del_cursor(cur, error);
	if (error)
		return error;

	/* check that the returned record contains the required inode */
	if (rec.ir_startino > agino ||
	    rec.ir_startino + M_IGEO(mp)->ialloc_inos <= agino)
		return -EINVAL;

	/* for untrusted inodes check it is allocated first */
	if ((flags & XFS_IGET_UNTRUSTED) &&
	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
		return -EINVAL;

	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
	*offset_agbno = agbno - *chunk_agbno;
	return 0;
}
/*
 * Return the location of the inode in imap, for mapping it into a buffer.
 */
int
xfs_imap(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,	/* inode to locate */
	struct xfs_imap		*imap,	/* location map structure */
	uint			flags)	/* flags for inode btree lookup */
{
	struct xfs_mount	*mp = pag_mount(pag);
	xfs_agblock_t		agbno;	/* block number of inode in the alloc group */
	xfs_agino_t		agino;	/* inode number within alloc group */
	xfs_agblock_t		chunk_agbno;	/* first block in inode chunk */
	xfs_agblock_t		cluster_agbno;	/* first block in inode cluster */
	int			error;	/* error code */
	int			offset;	/* index of inode in its buffer */
	xfs_agblock_t		offset_agbno;	/* blks from chunk start to inode */

	ASSERT(ino != NULLFSINO);

	/*
	 * Split up the inode number into its parts.
	 */
	agino = XFS_INO_TO_AGINO(mp, ino);
	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
	if (agbno >= xfs_ag_block_count(mp, pag_agno(pag)) ||
	    ino != xfs_agino_to_ino(pag, agino)) {
		error = -EINVAL;
#ifdef DEBUG
		/*
		 * Don't output diagnostic information for untrusted inodes
		 * as they can be invalid without implying corruption.
		 */
		if (flags & XFS_IGET_UNTRUSTED)
			return error;
		if (agbno >= xfs_ag_block_count(mp, pag_agno(pag))) {
			xfs_alert(mp,
		"%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
				__func__, (unsigned long long)agbno,
				(unsigned long)xfs_ag_block_count(mp,
						pag_agno(pag)));
		}
		if (ino != xfs_agino_to_ino(pag, agino)) {
			xfs_alert(mp,
		"%s: ino (0x%llx) != xfs_agino_to_ino() (0x%llx)",
				__func__, ino,
				xfs_agino_to_ino(pag, agino));
		}
		xfs_stack_trace();
#endif /* DEBUG */
		return error;
	}

	/*
	 * For bulkstat and handle lookups, we have an untrusted inode number
	 * that we have to verify is valid. We cannot do this just by reading
	 * the inode buffer as it may have been unlinked and removed leaving
	 * inodes in stale state on disk. Hence we have to do a btree lookup
	 * in all cases where an untrusted inode number is passed.
	 */
	if (flags & XFS_IGET_UNTRUSTED) {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
		goto out_map;
	}

	/*
	 * If the inode cluster size is the same as the blocksize or
	 * smaller we get to the buffer by simple arithmetics.
	 */
	if (M_IGEO(mp)->blocks_per_cluster == 1) {
		offset = XFS_INO_TO_OFFSET(mp, ino);
		ASSERT(offset < mp->m_sb.sb_inopblock);

		imap->im_blkno = xfs_agbno_to_daddr(pag, agbno);
		imap->im_len = XFS_FSB_TO_BB(mp, 1);
		imap->im_boffset = (unsigned short)(offset <<
							mp->m_sb.sb_inodelog);
		return 0;
	}

	/*
	 * If the inode chunks are aligned then use simple maths to
	 * find the location. Otherwise we have to do a btree
	 * lookup to find the location.
	 */
	if (M_IGEO(mp)->inoalign_mask) {
		offset_agbno = agbno & M_IGEO(mp)->inoalign_mask;
		chunk_agbno = agbno - offset_agbno;
	} else {
		error = xfs_imap_lookup(pag, tp, agino, agbno,
					&chunk_agbno, &offset_agbno, flags);
		if (error)
			return error;
	}

out_map:
	ASSERT(agbno >= chunk_agbno);
	cluster_agbno = chunk_agbno +
		((offset_agbno / M_IGEO(mp)->blocks_per_cluster) *
		 M_IGEO(mp)->blocks_per_cluster);
	offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
		XFS_INO_TO_OFFSET(mp, ino);

	imap->im_blkno = xfs_agbno_to_daddr(pag, cluster_agbno);
	imap->im_len = XFS_FSB_TO_BB(mp, M_IGEO(mp)->blocks_per_cluster);
	imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
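
	/*
	 * Worked example (illustrative numbers only): with 4k blocks and
	 * 512-byte inodes, sb_inopblock = 8 and a 16k cluster spans
	 * blocks_per_cluster = 4 blocks. An inode 5 blocks past chunk_agbno
	 * lands in the second cluster (5 / 4 = 1), so cluster_agbno is
	 * chunk_agbno + 4 and offset = (5 - 4) * 8 + XFS_INO_TO_OFFSET().
	 */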

	/*
	 * If the inode number maps to a block outside the bounds
	 * of the file system then return NULL rather than calling
	 * read_buf and panicing when we get an error from the
	 * driver.
	 */
	if ((imap->im_blkno + imap->im_len) >
	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
		xfs_alert(mp,
	"%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
			__func__, (unsigned long long) imap->im_blkno,
			(unsigned long long) imap->im_len,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
		return -EINVAL;
	}
	return 0;
}
/*
 * Log specified fields for the ag hdr (inode section). The growth of the agi
 * structure over time requires that we interpret the buffer as two logical
 * regions delineated by the end of the unlinked list. This is due to the size
 * of the hash table and its location in the middle of the agi.
 *
 * For example, a request to log a field before agi_unlinked and a field after
 * agi_unlinked could cause us to log the entire hash table and use an excessive
 * amount of log space. To avoid this behavior, log the region up through
 * agi_unlinked in one call and the region after agi_unlinked through the end of
 * the structure in another.
 */
void
xfs_ialloc_log_agi(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp,
	uint32_t		fields)
{
	int			first;		/* first byte number */
	int			last;		/* last byte number */
	static const short	offsets[] = {	/* field starting offsets */
					/* keep in sync with bit definitions */
		offsetof(xfs_agi_t, agi_magicnum),
		offsetof(xfs_agi_t, agi_versionnum),
		offsetof(xfs_agi_t, agi_seqno),
		offsetof(xfs_agi_t, agi_length),
		offsetof(xfs_agi_t, agi_count),
		offsetof(xfs_agi_t, agi_root),
		offsetof(xfs_agi_t, agi_level),
		offsetof(xfs_agi_t, agi_freecount),
		offsetof(xfs_agi_t, agi_newino),
		offsetof(xfs_agi_t, agi_dirino),
		offsetof(xfs_agi_t, agi_unlinked),
		offsetof(xfs_agi_t, agi_free_root),
		offsetof(xfs_agi_t, agi_free_level),
		offsetof(xfs_agi_t, agi_iblocks),
		sizeof(xfs_agi_t)
	};
#ifdef DEBUG
	struct xfs_agi		*agi = bp->b_addr;

	ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif

	/*
	 * Compute byte offsets for the first and last fields in the first
	 * region and log the agi buffer. This only logs up through
	 * agi_unlinked.
	 */
	if (fields & XFS_AGI_ALL_BITS_R1) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}

	/*
	 * Mask off the bits in the first region and calculate the first and
	 * last field offsets for any bits in the second region.
	 */
	fields &= ~XFS_AGI_ALL_BITS_R1;
	if (fields) {
		xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
				  &first, &last);
		xfs_trans_log_buf(tp, bp, first, last);
	}
}
static xfs_failaddr_t
xfs_agi_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;
	uint32_t		agi_seqno = be32_to_cpu(agi->agi_seqno);
	uint32_t		agi_length = be32_to_cpu(agi->agi_length);
	int			i;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn)))
			return __this_address;
	}

	/*
	 * Validate the magic number of the agi block.
	 */
	if (!xfs_verify_magic(bp, agi->agi_magicnum))
		return __this_address;
	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
		return __this_address;

	fa = xfs_validate_ag_length(bp, agi_seqno, agi_length);
	if (fa)
		return fa;

	if (be32_to_cpu(agi->agi_level) < 1 ||
	    be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels)
		return __this_address;

	if (xfs_has_finobt(mp) &&
	    (be32_to_cpu(agi->agi_free_level) < 1 ||
	     be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels))
		return __this_address;

	for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
		if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
			continue;
		if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
			return __this_address;
	}

	return NULL;
}
static void
xfs_agi_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	if (xfs_has_crc(mp) &&
	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agi_verify(bp);
		if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}
static void
xfs_agi_write_verify(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	struct xfs_agi		*agi = bp->b_addr;
	xfs_failaddr_t		fa;

	fa = xfs_agi_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (!xfs_has_crc(mp))
		return;

	if (bip)
		agi->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
	.name = "xfs_agi",
	.magic = { cpu_to_be32(XFS_AGI_MAGIC), cpu_to_be32(XFS_AGI_MAGIC) },
	.verify_read = xfs_agi_read_verify,
	.verify_write = xfs_agi_write_verify,
	.verify_struct = xfs_agi_verify,
};
/*
 * Read in the allocation group header (inode allocation section)
 */
int
xfs_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_mount	*mp = pag_mount(pag);
	int			error;

	trace_xfs_read_agi(pag);
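
	/*
	 * The AGI sits at a fixed disk address within each AG, just after
	 * the superblock and AGF sectors; XFS_AG_DADDR() combined with
	 * XFS_AGI_DADDR() computes that address for this AG.
	 */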
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag_agno(pag), XFS_AGI_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), flags, agibpp, &xfs_agi_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
	if (error)
		return error;
	if (tp)
		xfs_trans_buf_set_type(tp, *agibpp, XFS_BLFT_AGI_BUF);

	xfs_buf_set_ref(*agibpp, XFS_AGI_REF);
	return 0;
}
/*
 * Read in the agi and initialise the per-ag data. If the caller supplies a
 * @agibpp, return the locked AGI buffer to them, otherwise release it.
 */
int
xfs_ialloc_read_agi(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	int			flags,
	struct xfs_buf		**agibpp)
{
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	int			error;

	trace_xfs_ialloc_read_agi(pag);

	error = xfs_read_agi(pag, tp,
			(flags & XFS_IALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
			&agibp);
	if (error)
		return error;

	agi = agibp->b_addr;
	if (!xfs_perag_initialised_agi(pag)) {
		pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
		pag->pagi_count = be32_to_cpu(agi->agi_count);
		set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
	}

	/*
	 * It's possible for these to be out of sync if
	 * we are in the middle of a forced shutdown.
	 */
	ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
		xfs_is_shutdown(pag_mount(pag)));
	if (agibpp)
		*agibpp = agibp;
	else
		xfs_trans_brelse(tp, agibp);
	return 0;
}
/* How many inodes are backed by inode clusters ondisk? */
STATIC int
xfs_ialloc_count_ondisk(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			low,
	xfs_agino_t			high,
	unsigned int			*allocated)
{
	struct xfs_inobt_rec_incore	irec;
	unsigned int			ret = 0;
	int				has_record;
	int				error;

	error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
	if (error)
		return error;

	while (has_record) {
		unsigned int		i, hole_idx;

		error = xfs_inobt_get_rec(cur, &irec, &has_record);
		if (error)
			return error;
		if (irec.ir_startino > high)
			break;

		for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
			if (irec.ir_startino + i < low)
				continue;
			if (irec.ir_startino + i > high)
				break;
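
			/*
			 * Each holemask bit covers XFS_INODES_PER_HOLEMASK_BIT
			 * inodes of the 64-inode chunk; a set bit marks that
			 * group as a hole, so only clear bits count as being
			 * backed by an inode cluster on disk.
			 */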
			hole_idx = i / XFS_INODES_PER_HOLEMASK_BIT;
			if (!(irec.ir_holemask & (1U << hole_idx)))
				ret++;
		}

		error = xfs_btree_increment(cur, 0, &has_record);
		if (error)
			return error;
	}

	*allocated = ret;
	return 0;
}
/* Is there an inode record covering a given extent? */
int
xfs_ialloc_has_inodes_at_extent(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	xfs_agino_t		agino;
	xfs_agino_t		last_agino;
	unsigned int		allocated;
	int			error;

	agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno);
	last_agino = XFS_AGB_TO_AGINO(cur->bc_mp, bno + len) - 1;

	error = xfs_ialloc_count_ondisk(cur, agino, last_agino, &allocated);
	if (error)
		return error;

	if (allocated == 0)
		*outcome = XBTREE_RECPACKING_EMPTY;
	else if (allocated == last_agino - agino + 1)
		*outcome = XBTREE_RECPACKING_FULL;
	else
		*outcome = XBTREE_RECPACKING_SPARSE;
	return 0;
}
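
/*
 * Example of the classification above (illustrative geometry): a 4-block
 * extent at 8 inodes per block spans 32 inode numbers; if all 32 are backed
 * on disk the packing is FULL, if none are it is EMPTY, and anything in
 * between is SPARSE.
 */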

struct xfs_ialloc_count_inodes {
	xfs_agino_t			count;
	xfs_agino_t			freecount;
};

/* Record inode counts across all inobt records. */
STATIC int
xfs_ialloc_count_inodes_rec(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xfs_ialloc_count_inodes	*ci = priv;
	xfs_failaddr_t			fa;

	xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
	fa = xfs_inobt_check_irec(to_perag(cur->bc_group), &irec);
	if (fa)
		return xfs_inobt_complain_bad_rec(cur, fa, &irec);

	ci->count += irec.ir_count;
	ci->freecount += irec.ir_freecount;

	return 0;
}
/* Count allocated and free inodes under an inobt. */
int
xfs_ialloc_count_inodes(
	struct xfs_btree_cur		*cur,
	xfs_agino_t			*count,
	xfs_agino_t			*freecount)
{
	struct xfs_ialloc_count_inodes	ci = {0};
	int				error;

	ASSERT(xfs_btree_is_ino(cur->bc_ops));
	error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
	if (error)
		return error;

	*count = ci.count;
	*freecount = ci.freecount;
	return 0;
}
/*
 * Initialize inode-related geometry information.
 *
 * Compute the inode btree min and max levels and set maxicount.
 *
 * Set the inode cluster size. This may still be overridden by the file
 * system block size if it is larger than the chosen cluster size.
 *
 * For v5 filesystems, scale the cluster size with the inode size to keep a
 * constant ratio of inode per cluster buffer, but only if mkfs has set the
 * inode alignment value appropriately for larger cluster sizes.
 *
 * Then compute the inode cluster alignment information.
 */
void
xfs_ialloc_setup_geometry(
	struct xfs_mount	*mp)
{
	struct xfs_sb		*sbp = &mp->m_sb;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	uint64_t		icount;
	uint			inodes;

	igeo->new_diflags2 = 0;
	if (xfs_has_bigtime(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME;
	if (xfs_has_large_extent_counts(mp))
		igeo->new_diflags2 |= XFS_DIFLAG2_NREXT64;

	/* Compute inode btree geometry. */
	igeo->agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
	igeo->inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, true);
	igeo->inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, false);
	igeo->inobt_mnr[0] = igeo->inobt_mxr[0] / 2;
	igeo->inobt_mnr[1] = igeo->inobt_mxr[1] / 2;

	igeo->ialloc_inos = max_t(uint16_t, XFS_INODES_PER_CHUNK,
			sbp->sb_inopblock);
	igeo->ialloc_blks = igeo->ialloc_inos >> sbp->sb_inopblog;

	if (sbp->sb_spino_align)
		igeo->ialloc_min_blks = sbp->sb_spino_align;
	else
		igeo->ialloc_min_blks = igeo->ialloc_blks;

	/* Compute and fill in value of m_ino_geo.inobt_maxlevels. */
	inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
	igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr,
			inodes);
	ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk());

	/*
	 * Set the maximum inode count for this filesystem, being careful not
	 * to use obviously garbage sb_inopblog/sb_inopblock values. Regular
	 * users should never get here due to failing sb verification, but
	 * certain users (xfs_db) need to be usable even with corrupt metadata.
	 */
	if (sbp->sb_imax_pct && igeo->ialloc_blks) {
		/*
		 * Make sure the maximum inode count is a multiple
		 * of the units we allocate inodes in.
		 */
		icount = sbp->sb_dblocks * sbp->sb_imax_pct;
		do_div(icount, 100);
		do_div(icount, igeo->ialloc_blks);
		igeo->maxicount = XFS_FSB_TO_INO(mp,
				icount * igeo->ialloc_blks);
	} else {
		igeo->maxicount = 0;
	}
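
	/*
	 * Illustrative maxicount arithmetic (assumed values): 1,000,000 data
	 * blocks with sb_imax_pct = 25 gives 250,000 blocks, rounded down to
	 * a multiple of ialloc_blks; at 8 inodes per block XFS_FSB_TO_INO()
	 * then caps the filesystem at 2,000,000 inodes.
	 */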

	/*
	 * Compute the desired size of an inode cluster buffer size, which
	 * starts at 8K and (on v5 filesystems) scales up with larger inode
	 * sizes.
	 *
	 * Preserve the desired inode cluster size because the sparse inodes
	 * feature uses that desired size (not the actual size) to compute the
	 * sparse inode alignment. The mount code validates this value, so we
	 * cannot change the behavior.
	 */
	igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
	if (xfs_has_v3inodes(mp)) {
		int	new_size = igeo->inode_cluster_size_raw;

		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
			igeo->inode_cluster_size_raw = new_size;
	}
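
	/*
	 * Example of the scaling above (illustrative, assuming a 256-byte
	 * minimum on-disk inode): with 1024-byte inodes the factor
	 * sb_inodesize / XFS_DINODE_MIN_SIZE is 4, so the 8k base cluster
	 * grows to 32k, but only if sb_inoalignmt already spans that much.
	 */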

	/* Calculate inode cluster ratios. */
	if (igeo->inode_cluster_size_raw > mp->m_sb.sb_blocksize)
		igeo->blocks_per_cluster = XFS_B_TO_FSBT(mp,
				igeo->inode_cluster_size_raw);
	else
		igeo->blocks_per_cluster = 1;
	igeo->inode_cluster_size = XFS_FSB_TO_B(mp, igeo->blocks_per_cluster);
	igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster);

	/* Calculate inode cluster alignment. */
	if (xfs_has_align(mp) &&
	    mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster)
		igeo->cluster_align = mp->m_sb.sb_inoalignmt;
	else
		igeo->cluster_align = 1;
	igeo->inoalign_mask = igeo->cluster_align - 1;
	igeo->cluster_align_inodes = XFS_FSB_TO_INO(mp, igeo->cluster_align);

	/*
	 * If we are using stripe alignment, check whether
	 * the stripe unit is a multiple of the inode alignment
	 */
	if (mp->m_dalign && igeo->inoalign_mask &&
	    !(mp->m_dalign & igeo->inoalign_mask))
		igeo->ialloc_align = mp->m_dalign;
	else
		igeo->ialloc_align = 0;

	if (mp->m_sb.sb_blocksize > PAGE_SIZE)
		igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT;
	else
		igeo->min_folio_order = 0;
}
/* Compute the location of the root directory inode that is laid out by mkfs. */
xfs_ino_t
xfs_ialloc_calc_rootino(
	struct xfs_mount	*mp,
	int			sunit)
{
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agblock_t		first_bno;

	/*
	 * Pre-calculate the geometry of AG 0. We know what it looks like
	 * because libxfs knows how to create allocation groups now.
	 *
	 * first_bno is the first block in which mkfs could possibly have
	 * allocated the root directory inode, once we factor in the metadata
	 * that mkfs formats before it. Namely, the four AG headers...
	 */
	first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
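
	/*
	 * For example (illustrative sizes): with 512-byte sectors and 4k
	 * blocks the four AG headers fit in howmany(2048, 4096) = 1 block.
	 */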

	/* ...the two free space btree roots... */
	first_bno += 2;

	/* ...the inode btree root... */
	first_bno++;

	/* ...the initial AGFL... */
	first_bno += xfs_alloc_min_freelist(mp, NULL);

	/* ...the free inode btree root... */
	if (xfs_has_finobt(mp))
		first_bno++;

	/* ...the reverse mapping btree root... */
	if (xfs_has_rmapbt(mp))
		first_bno++;

	/* ...the reference count btree... */
	if (xfs_has_reflink(mp))
		first_bno++;

	/*
	 * ...and the log, if it is allocated in the first allocation group.
	 *
	 * This can happen with filesystems that only have a single
	 * allocation group, or very odd geometries created by old mkfs
	 * versions on very small filesystems.
	 */
	if (xfs_ag_contains_log(mp, 0))
		first_bno += mp->m_sb.sb_logblocks;

	/*
	 * Now round first_bno up to whatever allocation alignment is given
	 * by the filesystem or was passed in.
	 */
	if (xfs_has_dalign(mp) && igeo->ialloc_align > 0)
		first_bno = roundup(first_bno, sunit);
	else if (xfs_has_align(mp) &&
			mp->m_sb.sb_inoalignmt > 1)
		first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);

	return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
}
/*
 * Ensure there are not sparse inode clusters that cross the new EOAG.
 *
 * This is a no-op for non-spinode filesystems since clusters are always fully
 * allocated and checking the bnobt suffices. However, a spinode filesystem
 * could have a record where the upper inodes are free blocks. If those blocks
 * were removed from the filesystem, the inode record would extend beyond EOAG,
 * which will be flagged as corruption.
 */
int
xfs_ialloc_check_shrink(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		*agibp,
	xfs_agblock_t		new_length)
{
	struct xfs_inobt_rec_incore rec;
	struct xfs_btree_cur	*cur;
	xfs_agino_t		agino;
	int			has;
	int			error;

	if (!xfs_has_sparseinodes(pag_mount(pag)))
		return 0;

	cur = xfs_inobt_init_cursor(pag, tp, agibp);

	/* Look up the inobt record that would correspond to the new EOFS. */
	agino = XFS_AGB_TO_AGINO(pag_mount(pag), new_length);
	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has);
	if (error || !has)
		goto out;

	error = xfs_inobt_get_rec(cur, &rec, &has);
	if (error)
		goto out;

	if (!has) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
		error = -EFSCORRUPTED;
		goto out;
	}

	/* If the record covers inodes that would be beyond EOFS, bail out. */
	if (rec.ir_startino + XFS_INODES_PER_CHUNK > agino) {
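		/*
		 * e.g. if new_length maps to agino 1024 but this record
		 * starts at ir_startino 1000, its 64-inode chunk covers
		 * inodes 1000..1063 and would extend past the proposed end
		 * of the AG, so the shrink has to be rejected.
		 */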
		error = -ENOSPC;
		goto out;
	}
out:
	xfs_btree_del_cursor(cur, error);
	return error;
}