libxfs/xfs_da_btree.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * Copyright (c) 2013 Red Hat, Inc.
   4  * All Rights Reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it would be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write the Free Software Foundation,
  17  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18  */
  19 #include "libxfs_priv.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_shared.h"
  22 #include "xfs_format.h"
  23 #include "xfs_log_format.h"
  24 #include "xfs_trans_resv.h"
  25 #include "xfs_bit.h"
  26 #include "xfs_mount.h"
  27 #include "xfs_da_format.h"
  28 #include "xfs_da_btree.h"
  29 #include "xfs_dir2.h"
  30 #include "xfs_dir2_priv.h"
  31 #include "xfs_inode.h"
  32 #include "xfs_trans.h"
  33 #include "xfs_alloc.h"
  34 #include "xfs_bmap.h"
  35 #include "xfs_attr_leaf.h"
  36 #include "xfs_trace.h"
  37 #include "xfs_cksum.h"
  38
  39 /*
  40  * xfs_da_btree.c
  41  *
  42  * Routines to implement directories as Btrees of hashed names.
  43  */
  44
  45 /*========================================================================
  46  * Function prototypes for the kernel.
  47  *========================================================================*/
  48
  49 /*
  50  * Routines used for growing the Btree.
  51  */
  52 STATIC int xfs_da3_root_split(xfs_da_state_t *state,
  53                                             xfs_da_state_blk_t *existing_root,
  54                                             xfs_da_state_blk_t *new_child);
  55 STATIC int xfs_da3_node_split(xfs_da_state_t *state,
  56                                             xfs_da_state_blk_t *existing_blk,
  57                                             xfs_da_state_blk_t *split_blk,
  58                                             xfs_da_state_blk_t *blk_to_add,
  59                                             int treelevel,
  60                                             int *result);
  61 STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
  62                                          xfs_da_state_blk_t *node_blk_1,
  63                                          xfs_da_state_blk_t *node_blk_2);
  64 STATIC void xfs_da3_node_add(xfs_da_state_t *state,
  65                                    xfs_da_state_blk_t *old_node_blk,
  66                                    xfs_da_state_blk_t *new_node_blk);
  67
  68 /*
  69  * Routines used for shrinking the Btree.
  70  */
  71 STATIC int xfs_da3_root_join(xfs_da_state_t *state,
  72                                            xfs_da_state_blk_t *root_blk);
  73 STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
  74 STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
  75                                               xfs_da_state_blk_t *drop_blk);
  76 STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
  77                                          xfs_da_state_blk_t *src_node_blk,
  78                                          xfs_da_state_blk_t *dst_node_blk);
  79
  80 /*
  81  * Utility routines.
  82  */
  83 STATIC int      xfs_da3_blk_unlink(xfs_da_state_t *state,
  84                                   xfs_da_state_blk_t *drop_blk,
  85                                   xfs_da_state_blk_t *save_blk);
  86
  87
  88 kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
  89
  90 /*
  91  * Allocate a dir-state structure.
  92  * We don't put them on the stack since they're large.
  93  */
  94 xfs_da_state_t *
  95 xfs_da_state_alloc(void)
  96 {
  97         return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
  98 }
  99
 100 /*
 101  * Kill the altpath contents of a da-state structure.
 102  */
 103 STATIC void
 104 xfs_da_state_kill_altpath(xfs_da_state_t *state)
 105 {
 106         int     i;
 107
 108         for (i = 0; i < state->altpath.active; i++)
 109                 state->altpath.blk[i].bp = NULL;
 110         state->altpath.active = 0;
 111 }
 112
 113 /*
 114  * Free a da-state structure.
 115  */
 116 void
 117 xfs_da_state_free(xfs_da_state_t *state)
 118 {
 119         xfs_da_state_kill_altpath(state);
 120 #ifdef DEBUG
 121         memset((char *)state, 0, sizeof(*state));
 122 #endif /* DEBUG */
 123         kmem_zone_free(xfs_da_state_zone, state);
 124 }
 125
 126 static xfs_failaddr_t
 127 xfs_da3_node_verify(
 128         struct xfs_buf          *bp)
 129 {
 130         struct xfs_mount        *mp = bp->b_target->bt_mount;
 131         struct xfs_da_intnode   *hdr = bp->b_addr;
 132         struct xfs_da3_icnode_hdr ichdr;
 133         const struct xfs_dir_ops *ops;
 134
 135         ops = xfs_dir_get_ops(mp, NULL);
 136
 137         ops->node_hdr_from_disk(&ichdr, hdr);
 138
 139         if (xfs_sb_version_hascrc(&mp->m_sb)) {
 140                 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 141
 142                 if (ichdr.magic != XFS_DA3_NODE_MAGIC)
 143                         return __this_address;
 144
 145                 if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
 146                         return __this_address;
 147                 if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
 148                         return __this_address;
 149                 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
 150                         return __this_address;
 151         } else {
 152                 if (ichdr.magic != XFS_DA_NODE_MAGIC)
 153                         return __this_address;
 154         }
 155         if (ichdr.level == 0)
 156                 return __this_address;
 157         if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
 158                 return __this_address;
 159         if (ichdr.count == 0)
 160                 return __this_address;
 161
 162         /*
 163          * we don't know if the node is for and attribute or directory tree,
 164          * so only fail if the count is outside both bounds
 165          */
 166         if (ichdr.count > mp->m_dir_geo->node_ents &&
 167             ichdr.count > mp->m_attr_geo->node_ents)
 168                 return __this_address;
 169
 170         /* XXX: hash order check? */
 171
 172         return NULL;
 173 }
 174
 175 static void
 176 xfs_da3_node_write_verify(
 177         struct xfs_buf  *bp)
 178 {
 179         struct xfs_mount        *mp = bp->b_target->bt_mount;
 180         struct xfs_buf_log_item *bip = bp->b_log_item;
 181         struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 182         xfs_failaddr_t          fa;
 183
 184         fa = xfs_da3_node_verify(bp);
 185         if (fa) {
 186                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 187                 return;
 188         }
 189
 190         if (!xfs_sb_version_hascrc(&mp->m_sb))
 191                 return;
 192
 193         if (bip)
 194                 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
 195
 196         xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
 197 }
 198
 199 /*
 200  * leaf/node format detection on trees is sketchy, so a node read can be done on
 201  * leaf level blocks when detection identifies the tree as a node format tree
 202  * incorrectly. In this case, we need to swap the verifier to match the correct
 203  * format of the block being read.
 204  */
 205 static void
 206 xfs_da3_node_read_verify(
 207         struct xfs_buf          *bp)
 208 {
 209         struct xfs_da_blkinfo   *info = bp->b_addr;
 210         xfs_failaddr_t          fa;
 211
 212         switch (be16_to_cpu(info->magic)) {
 213                 case XFS_DA3_NODE_MAGIC:
 214                         if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
 215                                 xfs_verifier_error(bp, -EFSBADCRC,
 216                                                 __this_address);
 217                                 break;
 218                         }
 219                         /* fall through */
 220                 case XFS_DA_NODE_MAGIC:
 221                         fa = xfs_da3_node_verify(bp);
 222                         if (fa)
 223                                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 224                         return;
 225                 case XFS_ATTR_LEAF_MAGIC:
 226                 case XFS_ATTR3_LEAF_MAGIC:
 227                         bp->b_ops = &xfs_attr3_leaf_buf_ops;
 228                         bp->b_ops->verify_read(bp);
 229                         return;
 230                 case XFS_DIR2_LEAFN_MAGIC:
 231                 case XFS_DIR3_LEAFN_MAGIC:
 232                         bp->b_ops = &xfs_dir3_leafn_buf_ops;
 233                         bp->b_ops->verify_read(bp);
 234                         return;
 235                 default:
 236                         xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
 237                         break;
 238         }
 239 }
 240
 241 /* Verify the structure of a da3 block. */
 242 static xfs_failaddr_t
 243 xfs_da3_node_verify_struct(
 244         struct xfs_buf          *bp)
 245 {
 246         struct xfs_da_blkinfo   *info = bp->b_addr;
 247
 248         switch (be16_to_cpu(info->magic)) {
 249         case XFS_DA3_NODE_MAGIC:
 250         case XFS_DA_NODE_MAGIC:
 251                 return xfs_da3_node_verify(bp);
 252         case XFS_ATTR_LEAF_MAGIC:
 253         case XFS_ATTR3_LEAF_MAGIC:
 254                 bp->b_ops = &xfs_attr3_leaf_buf_ops;
 255                 return bp->b_ops->verify_struct(bp);
 256         case XFS_DIR2_LEAFN_MAGIC:
 257         case XFS_DIR3_LEAFN_MAGIC:
 258                 bp->b_ops = &xfs_dir3_leafn_buf_ops;
 259                 return bp->b_ops->verify_struct(bp);
 260         default:
 261                 return __this_address;
 262         }
 263 }
 264
 265 const struct xfs_buf_ops xfs_da3_node_buf_ops = {
 266         .name = "xfs_da3_node",
 267         .verify_read = xfs_da3_node_read_verify,
 268         .verify_write = xfs_da3_node_write_verify,
 269         .verify_struct = xfs_da3_node_verify_struct,
 270 };
 271
 272 int
 273 xfs_da3_node_read(
 274         struct xfs_trans        *tp,
 275         struct xfs_inode        *dp,
 276         xfs_dablk_t             bno,
 277         xfs_daddr_t             mappedbno,
 278         struct xfs_buf          **bpp,
 279         int                     which_fork)
 280 {
 281         int                     err;
 282
 283         err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
 284                                         which_fork, &xfs_da3_node_buf_ops);
 285         if (!err && tp && *bpp) {
 286                 struct xfs_da_blkinfo   *info = (*bpp)->b_addr;
 287                 int                     type;
 288
 289                 switch (be16_to_cpu(info->magic)) {
 290                 case XFS_DA_NODE_MAGIC:
 291                 case XFS_DA3_NODE_MAGIC:
 292                         type = XFS_BLFT_DA_NODE_BUF;
 293                         break;
 294                 case XFS_ATTR_LEAF_MAGIC:
 295                 case XFS_ATTR3_LEAF_MAGIC:
 296                         type = XFS_BLFT_ATTR_LEAF_BUF;
 297                         break;
 298                 case XFS_DIR2_LEAFN_MAGIC:
 299                 case XFS_DIR3_LEAFN_MAGIC:
 300                         type = XFS_BLFT_DIR_LEAFN_BUF;
 301                         break;
 302                 default:
 303                         XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
 304                                         tp->t_mountp, info, sizeof(*info));
 305                         xfs_trans_brelse(tp, *bpp);
 306                         *bpp = NULL;
 307                         return -EFSCORRUPTED;
 308                 }
 309                 xfs_trans_buf_set_type(tp, *bpp, type);
 310         }
 311         return err;
 312 }
 313
 314 /*========================================================================
 315  * Routines used for growing the Btree.
 316  *========================================================================*/
 317
 318 /*
 319  * Create the initial contents of an intermediate node.
 320  */
 321 int
 322 xfs_da3_node_create(
 323         struct xfs_da_args      *args,
 324         xfs_dablk_t             blkno,
 325         int                     level,
 326         struct xfs_buf          **bpp,
 327         int                     whichfork)
 328 {
 329         struct xfs_da_intnode   *node;
 330         struct xfs_trans        *tp = args->trans;
 331         struct xfs_mount        *mp = tp->t_mountp;
 332         struct xfs_da3_icnode_hdr ichdr = {0};
 333         struct xfs_buf          *bp;
 334         int                     error;
 335         struct xfs_inode        *dp = args->dp;
 336
 337         trace_xfs_da_node_create(args);
 338         ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
 339
 340         error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
 341         if (error)
 342                 return error;
 343         bp->b_ops = &xfs_da3_node_buf_ops;
 344         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 345         node = bp->b_addr;
 346
 347         if (xfs_sb_version_hascrc(&mp->m_sb)) {
 348                 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 349
 350                 memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
 351                 ichdr.magic = XFS_DA3_NODE_MAGIC;
 352                 hdr3->info.blkno = cpu_to_be64(bp->b_bn);
 353                 hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
 354                 uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
 355         } else {
 356                 ichdr.magic = XFS_DA_NODE_MAGIC;
 357         }
 358         ichdr.level = level;
 359
 360         dp->d_ops->node_hdr_to_disk(node, &ichdr);
 361         xfs_trans_log_buf(tp, bp,
 362                 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 363
 364         *bpp = bp;
 365         return 0;
 366 }
 367
 368 /*
 369  * Split a leaf node, rebalance, then possibly split
 370  * intermediate nodes, rebalance, etc.
 371  */
 372 int                                                     /* error */
 373 xfs_da3_split(
 374         struct xfs_da_state     *state)
 375 {
 376         struct xfs_da_state_blk *oldblk;
 377         struct xfs_da_state_blk *newblk;
 378         struct xfs_da_state_blk *addblk;
 379         struct xfs_da_intnode   *node;
 380         int                     max;
 381         int                     action = 0;
 382         int                     error;
 383         int                     i;
 384
 385         trace_xfs_da_split(state->args);
 386
 387         /*
 388          * Walk back up the tree splitting/inserting/adjusting as necessary.
 389          * If we need to insert and there isn't room, split the node, then
 390          * decide which fragment to insert the new block from below into.
 391          * Note that we may split the root this way, but we need more fixup.
 392          */
 393         max = state->path.active - 1;
 394         ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
 395         ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
 396                state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 397
 398         addblk = &state->path.blk[max];         /* initial dummy value */
 399         for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
 400                 oldblk = &state->path.blk[i];
 401                 newblk = &state->altpath.blk[i];
 402
 403                 /*
 404                  * If a leaf node then
 405                  *     Allocate a new leaf node, then rebalance across them.
 406                  * else if an intermediate node then
 407                  *     We split on the last layer, must we split the node?
 408                  */
 409                 switch (oldblk->magic) {
 410                 case XFS_ATTR_LEAF_MAGIC:
 411                         error = xfs_attr3_leaf_split(state, oldblk, newblk);
 412                         if ((error != 0) && (error != -ENOSPC)) {
 413                                 return error;   /* GROT: attr is inconsistent */
 414                         }
 415                         if (!error) {
 416                                 addblk = newblk;
 417                                 break;
 418                         }
 419                         /*
 420                          * Entry wouldn't fit, split the leaf again. The new
 421                          * extrablk will be consumed by xfs_da3_node_split if
 422                          * the node is split.
 423                          */
 424                         state->extravalid = 1;
 425                         if (state->inleaf) {
 426                                 state->extraafter = 0;  /* before newblk */
 427                                 trace_xfs_attr_leaf_split_before(state->args);
 428                                 error = xfs_attr3_leaf_split(state, oldblk,
 429                                                             &state->extrablk);
 430                         } else {
 431                                 state->extraafter = 1;  /* after newblk */
 432                                 trace_xfs_attr_leaf_split_after(state->args);
 433                                 error = xfs_attr3_leaf_split(state, newblk,
 434                                                             &state->extrablk);
 435                         }
 436                         if (error)
 437                                 return error;   /* GROT: attr inconsistent */
 438                         addblk = newblk;
 439                         break;
 440                 case XFS_DIR2_LEAFN_MAGIC:
 441                         error = xfs_dir2_leafn_split(state, oldblk, newblk);
 442                         if (error)
 443                                 return error;
 444                         addblk = newblk;
 445                         break;
 446                 case XFS_DA_NODE_MAGIC:
 447                         error = xfs_da3_node_split(state, oldblk, newblk, addblk,
 448                                                          max - i, &action);
 449                         addblk->bp = NULL;
 450                         if (error)
 451                                 return error;   /* GROT: dir is inconsistent */
 452                         /*
 453                          * Record the newly split block for the next time thru?
 454                          */
 455                         if (action)
 456                                 addblk = newblk;
 457                         else
 458                                 addblk = NULL;
 459                         break;
 460                 }
 461
 462                 /*
 463                  * Update the btree to show the new hashval for this child.
 464                  */
 465                 xfs_da3_fixhashpath(state, &state->path);
 466         }
 467         if (!addblk)
 468                 return 0;
 469
 470         /*
 471          * xfs_da3_node_split() should have consumed any extra blocks we added
 472          * during a double leaf split in the attr fork. This is guaranteed as
 473          * we can't be here if the attr fork only has a single leaf block.
 474          */
 475         ASSERT(state->extravalid == 0 ||
 476                state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 477
 478         /*
 479          * Split the root node.
 480          */
 481         ASSERT(state->path.active == 0);
 482         oldblk = &state->path.blk[0];
 483         error = xfs_da3_root_split(state, oldblk, addblk);
 484         if (error) {
 485                 addblk->bp = NULL;
 486                 return error;   /* GROT: dir is inconsistent */
 487         }
 488
 489         /*
 490          * Update pointers to the node which used to be block 0 and just got
 491          * bumped because of the addition of a new root node.  Note that the
 492          * original block 0 could be at any position in the list of blocks in
 493          * the tree.
 494          *
 495          * Note: the magic numbers and sibling pointers are in the same physical
 496          * place for both v2 and v3 headers (by design). Hence it doesn't matter
 497          * which version of the xfs_da_intnode structure we use here as the
 498          * result will be the same using either structure.
 499          */
 500         node = oldblk->bp->b_addr;
 501         if (node->hdr.info.forw) {
 502                 ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
 503                 node = addblk->bp->b_addr;
 504                 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
 505                 xfs_trans_log_buf(state->args->trans, addblk->bp,
 506                                   XFS_DA_LOGRANGE(node, &node->hdr.info,
 507                                   sizeof(node->hdr.info)));
 508         }
 509         node = oldblk->bp->b_addr;
 510         if (node->hdr.info.back) {
 511                 ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
 512                 node = addblk->bp->b_addr;
 513                 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
 514                 xfs_trans_log_buf(state->args->trans, addblk->bp,
 515                                   XFS_DA_LOGRANGE(node, &node->hdr.info,
 516                                   sizeof(node->hdr.info)));
 517         }
 518         addblk->bp = NULL;
 519         return 0;
 520 }
 521
 522 /*
 523  * Split the root.  We have to create a new root and point to the two
 524  * parts (the split old root) that we just created.  Copy block zero to
 525  * the EOF, extending the inode in process.
 526  */
 527 STATIC int                                              /* error */
 528 xfs_da3_root_split(
 529         struct xfs_da_state     *state,
 530         struct xfs_da_state_blk *blk1,
 531         struct xfs_da_state_blk *blk2)
 532 {
 533         struct xfs_da_intnode   *node;
 534         struct xfs_da_intnode   *oldroot;
 535         struct xfs_da_node_entry *btree;
 536         struct xfs_da3_icnode_hdr nodehdr;
 537         struct xfs_da_args      *args;
 538         struct xfs_buf          *bp;
 539         struct xfs_inode        *dp;
 540         struct xfs_trans        *tp;
 541         struct xfs_dir2_leaf    *leaf;
 542         xfs_dablk_t             blkno;
 543         int                     level;
 544         int                     error;
 545         int                     size;
 546
 547         trace_xfs_da_root_split(state->args);
 548
 549         /*
 550          * Copy the existing (incorrect) block from the root node position
 551          * to a free space somewhere.
 552          */
 553         args = state->args;
 554         error = xfs_da_grow_inode(args, &blkno);
 555         if (error)
 556                 return error;
 557
 558         dp = args->dp;
 559         tp = args->trans;
 560         error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
 561         if (error)
 562                 return error;
 563         node = bp->b_addr;
 564         oldroot = blk1->bp->b_addr;
 565         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
 566             oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
 567                 struct xfs_da3_icnode_hdr icnodehdr;
 568
 569                 dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
 570                 btree = dp->d_ops->node_tree_p(oldroot);
 571                 size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
 572                 level = icnodehdr.level;
 573
 574                 /*
 575                  * we are about to copy oldroot to bp, so set up the type
 576                  * of bp while we know exactly what it will be.
 577                  */
 578                 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 579         } else {
 580                 struct xfs_dir3_icleaf_hdr leafhdr;
 581                 struct xfs_dir2_leaf_entry *ents;
 582
 583                 leaf = (xfs_dir2_leaf_t *)oldroot;
 584                 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 585                 ents = dp->d_ops->leaf_ents_p(leaf);
 586
 587                 ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
 588                        leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
 589                 size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
 590                 level = 0;
 591
 592                 /*
 593                  * we are about to copy oldroot to bp, so set up the type
 594                  * of bp while we know exactly what it will be.
 595                  */
 596                 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
 597         }
 598
 599         /*
 600          * we can copy most of the information in the node from one block to
 601          * another, but for CRC enabled headers we have to make sure that the
 602          * block specific identifiers are kept intact. We update the buffer
 603          * directly for this.
 604          */
 605         memcpy(node, oldroot, size);
 606         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
 607             oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
 608                 struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
 609
 610                 node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
 611         }
 612         xfs_trans_log_buf(tp, bp, 0, size - 1);
 613
 614         bp->b_ops = blk1->bp->b_ops;
 615         xfs_trans_buf_copy_type(bp, blk1->bp);
 616         blk1->bp = bp;
 617         blk1->blkno = blkno;
 618
 619         /*
 620          * Set up the new root node.
 621          */
 622         error = xfs_da3_node_create(args,
 623                 (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
 624                 level + 1, &bp, args->whichfork);
 625         if (error)
 626                 return error;
 627
 628         node = bp->b_addr;
 629         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 630         btree = dp->d_ops->node_tree_p(node);
 631         btree[0].hashval = cpu_to_be32(blk1->hashval);
 632         btree[0].before = cpu_to_be32(blk1->blkno);
 633         btree[1].hashval = cpu_to_be32(blk2->hashval);
 634         btree[1].before = cpu_to_be32(blk2->blkno);
 635         nodehdr.count = 2;
 636         dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 637
 638 #ifdef DEBUG
 639         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
 640             oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
 641                 ASSERT(blk1->blkno >= args->geo->leafblk &&
 642                        blk1->blkno < args->geo->freeblk);
 643                 ASSERT(blk2->blkno >= args->geo->leafblk &&
 644                        blk2->blkno < args->geo->freeblk);
 645         }
 646 #endif
 647
 648         /* Header is already logged by xfs_da_node_create */
 649         xfs_trans_log_buf(tp, bp,
 650                 XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
 651
 652         return 0;
 653 }
 654
 655 /*
 656  * Split the node, rebalance, then add the new entry.
 657  */
 658 STATIC int                                              /* error */
 659 xfs_da3_node_split(
 660         struct xfs_da_state     *state,
 661         struct xfs_da_state_blk *oldblk,
 662         struct xfs_da_state_blk *newblk,
 663         struct xfs_da_state_blk *addblk,
 664         int                     treelevel,
 665         int                     *result)
 666 {
 667         struct xfs_da_intnode   *node;
 668         struct xfs_da3_icnode_hdr nodehdr;
 669         xfs_dablk_t             blkno;
 670         int                     newcount;
 671         int                     error;
 672         int                     useextra;
 673         struct xfs_inode        *dp = state->args->dp;
 674
 675         trace_xfs_da_node_split(state->args);
 676
 677         node = oldblk->bp->b_addr;
 678         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 679
 680         /*
 681          * With V2 dirs the extra block is data or freespace.
 682          */
 683         useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
 684         newcount = 1 + useextra;
 685         /*
 686          * Do we have to split the node?
 687          */
 688         if (nodehdr.count + newcount > state->args->geo->node_ents) {
 689                 /*
 690                  * Allocate a new node, add to the doubly linked chain of
 691                  * nodes, then move some of our excess entries into it.
 692                  */
 693                 error = xfs_da_grow_inode(state->args, &blkno);
 694                 if (error)
 695                         return error;   /* GROT: dir is inconsistent */
 696
 697                 error = xfs_da3_node_create(state->args, blkno, treelevel,
 698                                            &newblk->bp, state->args->whichfork);
 699                 if (error)
 700                         return error;   /* GROT: dir is inconsistent */
 701                 newblk->blkno = blkno;
 702                 newblk->magic = XFS_DA_NODE_MAGIC;
 703                 xfs_da3_node_rebalance(state, oldblk, newblk);
 704                 error = xfs_da3_blk_link(state, oldblk, newblk);
 705                 if (error)
 706                         return error;
 707                 *result = 1;
 708         } else {
 709                 *result = 0;
 710         }
 711
 712         /*
 713          * Insert the new entry(s) into the correct block
 714          * (updating last hashval in the process).
 715          *
 716          * xfs_da3_node_add() inserts BEFORE the given index,
 717          * and as a result of using node_lookup_int() we always
 718          * point to a valid entry (not after one), but a split
 719          * operation always results in a new block whose hashvals
 720          * FOLLOW the current block.
 721          *
 722          * If we had double-split op below us, then add the extra block too.
 723          */
 724         node = oldblk->bp->b_addr;
 725         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 726         if (oldblk->index <= nodehdr.count) {
 727                 oldblk->index++;
 728                 xfs_da3_node_add(state, oldblk, addblk);
 729                 if (useextra) {
 730                         if (state->extraafter)
 731                                 oldblk->index++;
 732                         xfs_da3_node_add(state, oldblk, &state->extrablk);
 733                         state->extravalid = 0;
 734                 }
 735         } else {
 736                 newblk->index++;
 737                 xfs_da3_node_add(state, newblk, addblk);
 738                 if (useextra) {
 739                         if (state->extraafter)
 740                                 newblk->index++;
 741                         xfs_da3_node_add(state, newblk, &state->extrablk);
 742                         state->extravalid = 0;
 743                 }
 744         }
 745
 746         return 0;
 747 }
 748
 749 /*
 750  * Balance the btree elements between two intermediate nodes,
 751  * usually one full and one empty.
 752  *
 753  * NOTE: if blk2 is empty, then it will get the upper half of blk1.
 754  */
 755 STATIC void
 756 xfs_da3_node_rebalance(
 757         struct xfs_da_state     *state,
 758         struct xfs_da_state_blk *blk1,
 759         struct xfs_da_state_blk *blk2)
 760 {
 761         struct xfs_da_intnode   *node1;
 762         struct xfs_da_intnode   *node2;
 763         struct xfs_da_intnode   *tmpnode;
 764         struct xfs_da_node_entry *btree1;
 765         struct xfs_da_node_entry *btree2;
 766         struct xfs_da_node_entry *btree_s;
 767         struct xfs_da_node_entry *btree_d;
 768         struct xfs_da3_icnode_hdr nodehdr1;
 769         struct xfs_da3_icnode_hdr nodehdr2;
 770         struct xfs_trans        *tp;
 771         int                     count;
 772         int                     tmp;
 773         int                     swap = 0;
 774         struct xfs_inode        *dp = state->args->dp;
 775
 776         trace_xfs_da_node_rebalance(state->args);
 777
 778         node1 = blk1->bp->b_addr;
 779         node2 = blk2->bp->b_addr;
 780         dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
 781         dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 782         btree1 = dp->d_ops->node_tree_p(node1);
 783         btree2 = dp->d_ops->node_tree_p(node2);
 784
 785         /*
 786          * Figure out how many entries need to move, and in which direction.
 787          * Swap the nodes around if that makes it simpler.
 788          */
 789         if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
 790             ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
 791              (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
 792                         be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
 793                 tmpnode = node1;
 794                 node1 = node2;
 795                 node2 = tmpnode;
 796                 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
 797                 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 798                 btree1 = dp->d_ops->node_tree_p(node1);
 799                 btree2 = dp->d_ops->node_tree_p(node2);
 800                 swap = 1;
 801         }
 802
 803         count = (nodehdr1.count - nodehdr2.count) / 2;
 804         if (count == 0)
 805                 return;
 806         tp = state->args->trans;
 807         /*
 808          * Two cases: high-to-low and low-to-high.
 809          */
 810         if (count > 0) {
 811                 /*
 812                  * Move elements in node2 up to make a hole.
 813                  */
 814                 tmp = nodehdr2.count;
 815                 if (tmp > 0) {
 816                         tmp *= (uint)sizeof(xfs_da_node_entry_t);
 817                         btree_s = &btree2[0];
 818                         btree_d = &btree2[count];
 819                         memmove(btree_d, btree_s, tmp);
 820                 }
 821
 822                 /*
 823                  * Move the req'd B-tree elements from high in node1 to
 824                  * low in node2.
 825                  */
 826                 nodehdr2.count += count;
 827                 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
 828                 btree_s = &btree1[nodehdr1.count - count];
 829                 btree_d = &btree2[0];
 830                 memcpy(btree_d, btree_s, tmp);
 831                 nodehdr1.count -= count;
 832         } else {
 833                 /*
 834                  * Move the req'd B-tree elements from low in node2 to
 835                  * high in node1.
 836                  */
 837                 count = -count;
 838                 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
 839                 btree_s = &btree2[0];
 840                 btree_d = &btree1[nodehdr1.count];
 841                 memcpy(btree_d, btree_s, tmp);
 842                 nodehdr1.count += count;
 843
 844                 xfs_trans_log_buf(tp, blk1->bp,
 845                         XFS_DA_LOGRANGE(node1, btree_d, tmp));
 846
 847                 /*
 848                  * Move elements in node2 down to fill the hole.
 849                  */
 850                 tmp  = nodehdr2.count - count;
 851                 tmp *= (uint)sizeof(xfs_da_node_entry_t);
 852                 btree_s = &btree2[count];
 853                 btree_d = &btree2[0];
 854                 memmove(btree_d, btree_s, tmp);
 855                 nodehdr2.count -= count;
 856         }
 857
 858         /*
 859          * Log header of node 1 and all current bits of node 2.
 860          */
 861         dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
 862         xfs_trans_log_buf(tp, blk1->bp,
 863                 XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
 864
 865         dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
 866         xfs_trans_log_buf(tp, blk2->bp,
 867                 XFS_DA_LOGRANGE(node2, &node2->hdr,
 868                                 dp->d_ops->node_hdr_size +
 869                                 (sizeof(btree2[0]) * nodehdr2.count)));
 870
 871         /*
 872          * Record the last hashval from each block for upward propagation.
 873          * (note: don't use the swapped node pointers)
 874          */
 875         if (swap) {
 876                 node1 = blk1->bp->b_addr;
 877                 node2 = blk2->bp->b_addr;
 878                 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
 879                 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 880                 btree1 = dp->d_ops->node_tree_p(node1);
 881                 btree2 = dp->d_ops->node_tree_p(node2);
 882         }
 883         blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
 884         blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
 885
 886         /*
 887          * Adjust the expected index for insertion.
 888          */
 889         if (blk1->index >= nodehdr1.count) {
 890                 blk2->index = blk1->index - nodehdr1.count;
 891                 blk1->index = nodehdr1.count + 1;       /* make it invalid */
 892         }
 893 }
 894
 895 /*
 896  * Add a new entry to an intermediate node.
 897  */
 898 STATIC void
 899 xfs_da3_node_add(
 900         struct xfs_da_state     *state,
 901         struct xfs_da_state_blk *oldblk,
 902         struct xfs_da_state_blk *newblk)
 903 {
 904         struct xfs_da_intnode   *node;
 905         struct xfs_da3_icnode_hdr nodehdr;
 906         struct xfs_da_node_entry *btree;
 907         int                     tmp;
 908         struct xfs_inode        *dp = state->args->dp;
 909
 910         trace_xfs_da_node_add(state->args);
 911
 912         node = oldblk->bp->b_addr;
 913         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 914         btree = dp->d_ops->node_tree_p(node);
 915
 916         ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
 917         ASSERT(newblk->blkno != 0);
 918         if (state->args->whichfork == XFS_DATA_FORK)
 919                 ASSERT(newblk->blkno >= state->args->geo->leafblk &&
 920                        newblk->blkno < state->args->geo->freeblk);
 921
 922         /*
 923          * We may need to make some room before we insert the new node.
 924          */
 925         tmp = 0;
 926         if (oldblk->index < nodehdr.count) {
 927                 tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
 928                 memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
 929         }
 930         btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
 931         btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
 932         xfs_trans_log_buf(state->args->trans, oldblk->bp,
 933                 XFS_DA_LOGRANGE(node, &btree[oldblk->index],
 934                                 tmp + sizeof(*btree)));
 935
 936         nodehdr.count += 1;
 937         dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 938         xfs_trans_log_buf(state->args->trans, oldblk->bp,
 939                 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 940
 941         /*
 942          * Copy the last hash value from the oldblk to propagate upwards.
 943          */
 944         oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 945 }
 946
 947 /*========================================================================
 948  * Routines used for shrinking the Btree.
 949  *========================================================================*/
 950
 951 /*
 952  * Deallocate an empty leaf node, remove it from its parent,
 953  * possibly deallocating that block, etc...
 954  */
 955 int
 956 xfs_da3_join(
 957         struct xfs_da_state     *state)
 958 {
 959         struct xfs_da_state_blk *drop_blk;
 960         struct xfs_da_state_blk *save_blk;
 961         int                     action = 0;
 962         int                     error;
 963
 964         trace_xfs_da_join(state->args);
 965
 966         drop_blk = &state->path.blk[ state->path.active-1 ];
 967         save_blk = &state->altpath.blk[ state->path.active-1 ];
 968         ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
 969         ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
 970                drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
 971
 972         /*
 973          * Walk back up the tree joining/deallocating as necessary.
 974          * When we stop dropping blocks, break out.
 975          */
 976         for (  ; state->path.active >= 2; drop_blk--, save_blk--,
 977                  state->path.active--) {
 978                 /*
 979                  * See if we can combine the block with a neighbor.
 980                  *   (action == 0) => no options, just leave
 981                  *   (action == 1) => coalesce, then unlink
 982                  *   (action == 2) => block empty, unlink it
 983                  */
 984                 switch (drop_blk->magic) {
 985                 case XFS_ATTR_LEAF_MAGIC:
 986                         error = xfs_attr3_leaf_toosmall(state, &action);
 987                         if (error)
 988                                 return error;
 989                         if (action == 0)
 990                                 return 0;
 991                         xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
 992                         break;
 993                 case XFS_DIR2_LEAFN_MAGIC:
 994                         error = xfs_dir2_leafn_toosmall(state, &action);
 995                         if (error)
 996                                 return error;
 997                         if (action == 0)
 998                                 return 0;
 999                         xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
1000                         break;
1001                 case XFS_DA_NODE_MAGIC:
1002                         /*
1003                          * Remove the offending node, fixup hashvals,
1004                          * check for a toosmall neighbor.
1005                          */
1006                         xfs_da3_node_remove(state, drop_blk);
1007                         xfs_da3_fixhashpath(state, &state->path);
1008                         error = xfs_da3_node_toosmall(state, &action);
1009                         if (error)
1010                                 return error;
1011                         if (action == 0)
1012                                 return 0;
1013                         xfs_da3_node_unbalance(state, drop_blk, save_blk);
1014                         break;
1015                 }
1016                 xfs_da3_fixhashpath(state, &state->altpath);
1017                 error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
1018                 xfs_da_state_kill_altpath(state);
1019                 if (error)
1020                         return error;
1021                 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
1022                                                          drop_blk->bp);
1023                 drop_blk->bp = NULL;
1024                 if (error)
1025                         return error;
1026         }
1027         /*
1028          * We joined all the way to the top.  If it turns out that
1029          * we only have one entry in the root, make the child block
1030          * the new root.
1031          */
1032         xfs_da3_node_remove(state, drop_blk);
1033         xfs_da3_fixhashpath(state, &state->path);
1034         error = xfs_da3_root_join(state, &state->path.blk[0]);
1035         return error;
1036 }
1037
#ifdef  DEBUG
/*
 * Debug-only sanity check of a block that is the only child of a node.
 *
 * blkinfo is the block's header and level is the level of the parent node.
 * When the parent is at level 1 the child must carry one of the leaf
 * magic numbers listed below; otherwise it must carry a da node magic
 * number.  In either case the child must have no forward or backward
 * sibling.
 */
static void
xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
{
        /* Kept in on-disk byte order; compared against converted constants. */
        __be16  magic = blkinfo->magic;

        if (level == 1) {
                ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
                       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
                       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
                       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
        } else {
                ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
                       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
        }
        /* An only child has no siblings on either side. */
        ASSERT(!blkinfo->forw);
        ASSERT(!blkinfo->back);
}
#else   /* !DEBUG */
/* Without DEBUG the check expands to nothing. */
#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
#endif  /* !DEBUG */
1059
1060 /*
1061  * We have only one entry in the root.  Copy the only remaining child of
1062  * the old root to block 0 as the new root node.
1063  */
1064 STATIC int
1065 xfs_da3_root_join(
1066         struct xfs_da_state     *state,
1067         struct xfs_da_state_blk *root_blk)
1068 {
1069         struct xfs_da_intnode   *oldroot;
1070         struct xfs_da_args      *args;
1071         xfs_dablk_t             child;
1072         struct xfs_buf          *bp;
1073         struct xfs_da3_icnode_hdr oldroothdr;
1074         struct xfs_da_node_entry *btree;
1075         int                     error;
1076         struct xfs_inode        *dp = state->args->dp;
1077
1078         trace_xfs_da_root_join(state->args);
1079
1080         ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
1081
1082         args = state->args;
1083         oldroot = root_blk->bp->b_addr;
1084         dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
1085         ASSERT(oldroothdr.forw == 0);
1086         ASSERT(oldroothdr.back == 0);
1087
1088         /*
1089          * If the root has more than one child, then don't do anything.
1090          */
1091         if (oldroothdr.count > 1)
1092                 return 0;
1093
1094         /*
1095          * Read in the (only) child block, then copy those bytes into
1096          * the root block's buffer and free the original child block.
1097          */
1098         btree = dp->d_ops->node_tree_p(oldroot);
1099         child = be32_to_cpu(btree[0].before);
1100         ASSERT(child != 0);
1101         error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
1102                                              args->whichfork);
1103         if (error)
1104                 return error;
1105         xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
1106
1107         /*
1108          * This could be copying a leaf back into the root block in the case of
1109          * there only being a single leaf block left in the tree. Hence we have
1110          * to update the b_ops pointer as well to match the buffer type change
1111          * that could occur. For dir3 blocks we also need to update the block
1112          * number in the buffer header.
1113          */
1114         memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
1115         root_blk->bp->b_ops = bp->b_ops;
1116         xfs_trans_buf_copy_type(root_blk->bp, bp);
1117         if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
1118                 struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
1119                 da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
1120         }
1121         xfs_trans_log_buf(args->trans, root_blk->bp, 0,
1122                           args->geo->blksize - 1);
1123         error = xfs_da_shrink_inode(args, child, bp);
1124         return error;
1125 }
1126
1127 /*
1128  * Check a node block and its neighbors to see if the block should be
1129  * collapsed into one or the other neighbor.  Always keep the block
1130  * with the smaller block number.
1131  * If the current block is over 50% full, don't try to join it, return 0.
1132  * If the block is empty, fill in the state structure and return 2.
1133  * If it can be collapsed, fill in the state structure and return 1.
1134  * If nothing can be done, return 0.
1135  */
1136 STATIC int
1137 xfs_da3_node_toosmall(
1138         struct xfs_da_state     *state,
1139         int                     *action)
1140 {
1141         struct xfs_da_intnode   *node;
1142         struct xfs_da_state_blk *blk;
1143         struct xfs_da_blkinfo   *info;
1144         xfs_dablk_t             blkno;
1145         struct xfs_buf          *bp;
1146         struct xfs_da3_icnode_hdr nodehdr;
1147         int                     count;
1148         int                     forward;
1149         int                     error;
1150         int                     retval;
1151         int                     i;
1152         struct xfs_inode        *dp = state->args->dp;
1153
1154         trace_xfs_da_node_toosmall(state->args);
1155
1156         /*
1157          * Check for the degenerate case of the block being over 50% full.
1158          * If so, it's not worth even looking to see if we might be able
1159          * to coalesce with a sibling.
1160          */
1161         blk = &state->path.blk[ state->path.active-1 ];
1162         info = blk->bp->b_addr;
1163         node = (xfs_da_intnode_t *)info;
1164         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1165         if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
1166                 *action = 0;    /* blk over 50%, don't try to join */
1167                 return 0;       /* blk over 50%, don't try to join */
1168         }
1169
1170         /*
1171          * Check for the degenerate case of the block being empty.
1172          * If the block is empty, we'll simply delete it, no need to
1173          * coalesce it with a sibling block.  We choose (arbitrarily)
1174          * to merge with the forward block unless it is NULL.
1175          */
1176         if (nodehdr.count == 0) {
1177                 /*
1178                  * Make altpath point to the block we want to keep and
1179                  * path point to the block we want to drop (this one).
1180                  */
1181                 forward = (info->forw != 0);
1182                 memcpy(&state->altpath, &state->path, sizeof(state->path));
1183                 error = xfs_da3_path_shift(state, &state->altpath, forward,
1184                                                  0, &retval);
1185                 if (error)
1186                         return error;
1187                 if (retval) {
1188                         *action = 0;
1189                 } else {
1190                         *action = 2;
1191                 }
1192                 return 0;
1193         }
1194
1195         /*
1196          * Examine each sibling block to see if we can coalesce with
1197          * at least 25% free space to spare.  We need to figure out
1198          * whether to merge with the forward or the backward block.
1199          * We prefer coalescing with the lower numbered sibling so as
1200          * to shrink a directory over time.
1201          */
1202         count  = state->args->geo->node_ents;
1203         count -= state->args->geo->node_ents >> 2;
1204         count -= nodehdr.count;
1205
1206         /* start with smaller blk num */
1207         forward = nodehdr.forw < nodehdr.back;
1208         for (i = 0; i < 2; forward = !forward, i++) {
1209                 struct xfs_da3_icnode_hdr thdr;
1210                 if (forward)
1211                         blkno = nodehdr.forw;
1212                 else
1213                         blkno = nodehdr.back;
1214                 if (blkno == 0)
1215                         continue;
1216                 error = xfs_da3_node_read(state->args->trans, dp,
1217                                         blkno, -1, &bp, state->args->whichfork);
1218                 if (error)
1219                         return error;
1220
1221                 node = bp->b_addr;
1222                 dp->d_ops->node_hdr_from_disk(&thdr, node);
1223                 xfs_trans_brelse(state->args->trans, bp);
1224
1225                 if (count - thdr.count >= 0)
1226                         break;  /* fits with at least 25% to spare */
1227         }
1228         if (i >= 2) {
1229                 *action = 0;
1230                 return 0;
1231         }
1232
1233         /*
1234          * Make altpath point to the block we want to keep (the lower
1235          * numbered block) and path point to the block we want to drop.
1236          */
1237         memcpy(&state->altpath, &state->path, sizeof(state->path));
1238         if (blkno < blk->blkno) {
1239                 error = xfs_da3_path_shift(state, &state->altpath, forward,
1240                                                  0, &retval);
1241         } else {
1242                 error = xfs_da3_path_shift(state, &state->path, forward,
1243                                                  0, &retval);
1244         }
1245         if (error)
1246                 return error;
1247         if (retval) {
1248                 *action = 0;
1249                 return 0;
1250         }
1251         *action = 1;
1252         return 0;
1253 }
1254
1255 /*
1256  * Pick up the last hashvalue from an intermediate node.
1257  */
1258 STATIC uint
1259 xfs_da3_node_lasthash(
1260         struct xfs_inode        *dp,
1261         struct xfs_buf          *bp,
1262         int                     *count)
1263 {
1264         struct xfs_da_intnode    *node;
1265         struct xfs_da_node_entry *btree;
1266         struct xfs_da3_icnode_hdr nodehdr;
1267
1268         node = bp->b_addr;
1269         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1270         if (count)
1271                 *count = nodehdr.count;
1272         if (!nodehdr.count)
1273                 return 0;
1274         btree = dp->d_ops->node_tree_p(node);
1275         return be32_to_cpu(btree[nodehdr.count - 1].hashval);
1276 }
1277
1278 /*
1279  * Walk back up the tree adjusting hash values as necessary,
1280  * when we stop making changes, return.
1281  */
1282 void
1283 xfs_da3_fixhashpath(
1284         struct xfs_da_state     *state,
1285         struct xfs_da_state_path *path)
1286 {
1287         struct xfs_da_state_blk *blk;
1288         struct xfs_da_intnode   *node;
1289         struct xfs_da_node_entry *btree;
1290         xfs_dahash_t            lasthash=0;
1291         int                     level;
1292         int                     count;
1293         struct xfs_inode        *dp = state->args->dp;
1294
1295         trace_xfs_da_fixhashpath(state->args);
1296
1297         level = path->active-1;
1298         blk = &path->blk[ level ];
1299         switch (blk->magic) {
1300         case XFS_ATTR_LEAF_MAGIC:
1301                 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
1302                 if (count == 0)
1303                         return;
1304                 break;
1305         case XFS_DIR2_LEAFN_MAGIC:
1306                 lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count);
1307                 if (count == 0)
1308                         return;
1309                 break;
1310         case XFS_DA_NODE_MAGIC:
1311                 lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
1312                 if (count == 0)
1313                         return;
1314                 break;
1315         }
1316         for (blk--, level--; level >= 0; blk--, level--) {
1317                 struct xfs_da3_icnode_hdr nodehdr;
1318
1319                 node = blk->bp->b_addr;
1320                 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1321                 btree = dp->d_ops->node_tree_p(node);
1322                 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1323                         break;
1324                 blk->hashval = lasthash;
1325                 btree[blk->index].hashval = cpu_to_be32(lasthash);
1326                 xfs_trans_log_buf(state->args->trans, blk->bp,
1327                                   XFS_DA_LOGRANGE(node, &btree[blk->index],
1328                                                   sizeof(*btree)));
1329
1330                 lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1331         }
1332 }
1333
1334 /*
1335  * Remove an entry from an intermediate node.
1336  */
1337 STATIC void
1338 xfs_da3_node_remove(
1339         struct xfs_da_state     *state,
1340         struct xfs_da_state_blk *drop_blk)
1341 {
1342         struct xfs_da_intnode   *node;
1343         struct xfs_da3_icnode_hdr nodehdr;
1344         struct xfs_da_node_entry *btree;
1345         int                     index;
1346         int                     tmp;
1347         struct xfs_inode        *dp = state->args->dp;
1348
1349         trace_xfs_da_node_remove(state->args);
1350
1351         node = drop_blk->bp->b_addr;
1352         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1353         ASSERT(drop_blk->index < nodehdr.count);
1354         ASSERT(drop_blk->index >= 0);
1355
1356         /*
1357          * Copy over the offending entry, or just zero it out.
1358          */
1359         index = drop_blk->index;
1360         btree = dp->d_ops->node_tree_p(node);
1361         if (index < nodehdr.count - 1) {
1362                 tmp  = nodehdr.count - index - 1;
1363                 tmp *= (uint)sizeof(xfs_da_node_entry_t);
1364                 memmove(&btree[index], &btree[index + 1], tmp);
1365                 xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1366                     XFS_DA_LOGRANGE(node, &btree[index], tmp));
1367                 index = nodehdr.count - 1;
1368         }
1369         memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
1370         xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1371             XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
1372         nodehdr.count -= 1;
1373         dp->d_ops->node_hdr_to_disk(node, &nodehdr);
1374         xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1375             XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
1376
1377         /*
1378          * Copy the last hash value from the block to propagate upwards.
1379          */
1380         drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
1381 }
1382
1383 /*
1384  * Unbalance the elements between two intermediate nodes,
1385  * move all Btree elements from one node into another.
1386  */
1387 STATIC void
1388 xfs_da3_node_unbalance(
1389         struct xfs_da_state     *state,
1390         struct xfs_da_state_blk *drop_blk,
1391         struct xfs_da_state_blk *save_blk)
1392 {
1393         struct xfs_da_intnode   *drop_node;
1394         struct xfs_da_intnode   *save_node;
1395         struct xfs_da_node_entry *drop_btree;
1396         struct xfs_da_node_entry *save_btree;
1397         struct xfs_da3_icnode_hdr drop_hdr;
1398         struct xfs_da3_icnode_hdr save_hdr;
1399         struct xfs_trans        *tp;
1400         int                     sindex;
1401         int                     tmp;
1402         struct xfs_inode        *dp = state->args->dp;
1403
1404         trace_xfs_da_node_unbalance(state->args);
1405
1406         drop_node = drop_blk->bp->b_addr;
1407         save_node = save_blk->bp->b_addr;
1408         dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
1409         dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
1410         drop_btree = dp->d_ops->node_tree_p(drop_node);
1411         save_btree = dp->d_ops->node_tree_p(save_node);
1412         tp = state->args->trans;
1413
1414         /*
1415          * If the dying block has lower hashvals, then move all the
1416          * elements in the remaining block up to make a hole.
1417          */
1418         if ((be32_to_cpu(drop_btree[0].hashval) <
1419                         be32_to_cpu(save_btree[0].hashval)) ||
1420             (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
1421                         be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
1422                 /* XXX: check this - is memmove dst correct? */
1423                 tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
1424                 memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
1425
1426                 sindex = 0;
1427                 xfs_trans_log_buf(tp, save_blk->bp,
1428                         XFS_DA_LOGRANGE(save_node, &save_btree[0],
1429                                 (save_hdr.count + drop_hdr.count) *
1430                                                 sizeof(xfs_da_node_entry_t)));
1431         } else {
1432                 sindex = save_hdr.count;
1433                 xfs_trans_log_buf(tp, save_blk->bp,
1434                         XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
1435                                 drop_hdr.count * sizeof(xfs_da_node_entry_t)));
1436         }
1437
1438         /*
1439          * Move all the B-tree elements from drop_blk to save_blk.
1440          */
1441         tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
1442         memcpy(&save_btree[sindex], &drop_btree[0], tmp);
1443         save_hdr.count += drop_hdr.count;
1444
1445         dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
1446         xfs_trans_log_buf(tp, save_blk->bp,
1447                 XFS_DA_LOGRANGE(save_node, &save_node->hdr,
1448                                 dp->d_ops->node_hdr_size));
1449
1450         /*
1451          * Save the last hashval in the remaining block for upward propagation.
1452          */
1453         save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
1454 }
1455
1456 /*========================================================================
1457  * Routines used for finding things in the Btree.
1458  *========================================================================*/
1459
1460 /*
1461  * Walk down the Btree looking for a particular filename, filling
1462  * in the state structure as we go.
1463  *
1464  * We will set the state structure to point to each of the elements
1465  * in each of the nodes where either the hashval is or should be.
1466  *
1467  * We support duplicate hashval's so for each entry in the current
1468  * node that could contain the desired hashval, descend.  This is a
1469  * pruned depth-first tree search.
1470  */
1471 int                                                     /* error */
1472 xfs_da3_node_lookup_int(
1473         struct xfs_da_state     *state,
1474         int                     *result)
1475 {
1476         struct xfs_da_state_blk *blk;
1477         struct xfs_da_blkinfo   *curr;
1478         struct xfs_da_intnode   *node;
1479         struct xfs_da_node_entry *btree;
1480         struct xfs_da3_icnode_hdr nodehdr;
1481         struct xfs_da_args      *args;
1482         xfs_dablk_t             blkno;
1483         xfs_dahash_t            hashval;
1484         xfs_dahash_t            btreehashval;
1485         int                     probe;
1486         int                     span;
1487         int                     max;
1488         int                     error;
1489         int                     retval;
1490         unsigned int            expected_level = 0;
1491         struct xfs_inode        *dp = state->args->dp;
1492
1493         args = state->args;
1494
1495         /*
1496          * Descend thru the B-tree searching each level for the right
1497          * node to use, until the right hashval is found.
1498          */
1499         blkno = args->geo->leafblk;
1500         for (blk = &state->path.blk[0], state->path.active = 1;
1501                          state->path.active <= XFS_DA_NODE_MAXDEPTH;
1502                          blk++, state->path.active++) {
1503                 /*
1504                  * Read the next node down in the tree.
1505                  */
1506                 blk->blkno = blkno;
1507                 error = xfs_da3_node_read(args->trans, args->dp, blkno,
1508                                         -1, &blk->bp, args->whichfork);
1509                 if (error) {
1510                         blk->blkno = 0;
1511                         state->path.active--;
1512                         return error;
1513                 }
1514                 curr = blk->bp->b_addr;
1515                 blk->magic = be16_to_cpu(curr->magic);
1516
1517                 if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
1518                     blk->magic == XFS_ATTR3_LEAF_MAGIC) {
1519                         blk->magic = XFS_ATTR_LEAF_MAGIC;
1520                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1521                         break;
1522                 }
1523
1524                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1525                     blk->magic == XFS_DIR3_LEAFN_MAGIC) {
1526                         blk->magic = XFS_DIR2_LEAFN_MAGIC;
1527                         blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
1528                                                               blk->bp, NULL);
1529                         break;
1530                 }
1531
1532                 blk->magic = XFS_DA_NODE_MAGIC;
1533
1534
1535                 /*
1536                  * Search an intermediate node for a match.
1537                  */
1538                 node = blk->bp->b_addr;
1539                 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1540                 btree = dp->d_ops->node_tree_p(node);
1541
1542                 /* Tree taller than we can handle; bail out! */
1543                 if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
1544                         return -EFSCORRUPTED;
1545
1546                 /* Check the level from the root. */
1547                 if (blkno == args->geo->leafblk)
1548                         expected_level = nodehdr.level - 1;
1549                 else if (expected_level != nodehdr.level)
1550                         return -EFSCORRUPTED;
1551                 else
1552                         expected_level--;
1553
1554                 max = nodehdr.count;
1555                 blk->hashval = be32_to_cpu(btree[max - 1].hashval);
1556
1557                 /*
1558                  * Binary search.  (note: small blocks will skip loop)
1559                  */
1560                 probe = span = max / 2;
1561                 hashval = args->hashval;
1562                 while (span > 4) {
1563                         span /= 2;
1564                         btreehashval = be32_to_cpu(btree[probe].hashval);
1565                         if (btreehashval < hashval)
1566                                 probe += span;
1567                         else if (btreehashval > hashval)
1568                                 probe -= span;
1569                         else
1570                                 break;
1571                 }
1572                 ASSERT((probe >= 0) && (probe < max));
1573                 ASSERT((span <= 4) ||
1574                         (be32_to_cpu(btree[probe].hashval) == hashval));
1575
1576                 /*
1577                  * Since we may have duplicate hashval's, find the first
1578                  * matching hashval in the node.
1579                  */
1580                 while (probe > 0 &&
1581                        be32_to_cpu(btree[probe].hashval) >= hashval) {
1582                         probe--;
1583                 }
1584                 while (probe < max &&
1585                        be32_to_cpu(btree[probe].hashval) < hashval) {
1586                         probe++;
1587                 }
1588
1589                 /*
1590                  * Pick the right block to descend on.
1591                  */
1592                 if (probe == max) {
1593                         blk->index = max - 1;
1594                         blkno = be32_to_cpu(btree[max - 1].before);
1595                 } else {
1596                         blk->index = probe;
1597                         blkno = be32_to_cpu(btree[probe].before);
1598                 }
1599
1600                 /* We can't point back to the root. */
1601                 if (blkno == args->geo->leafblk)
1602                         return -EFSCORRUPTED;
1603         }
1604
1605         if (expected_level != 0)
1606                 return -EFSCORRUPTED;
1607
1608         /*
1609          * A leaf block that ends in the hashval that we are interested in
1610          * (final hashval == search hashval) means that the next block may
1611          * contain more entries with the same hashval, shift upward to the
1612          * next leaf and keep searching.
1613          */
1614         for (;;) {
1615                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1616                         retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1617                                                         &blk->index, state);
1618                 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1619                         retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
1620                         blk->index = args->index;
1621                         args->blkno = blk->blkno;
1622                 } else {
1623                         ASSERT(0);
1624                         return -EFSCORRUPTED;
1625                 }
1626                 if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
1627                     (blk->hashval == args->hashval)) {
1628                         error = xfs_da3_path_shift(state, &state->path, 1, 1,
1629                                                          &retval);
1630                         if (error)
1631                                 return error;
1632                         if (retval == 0) {
1633                                 continue;
1634                         } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1635                                 /* path_shift() gives ENOENT */
1636                                 retval = -ENOATTR;
1637                         }
1638                 }
1639                 break;
1640         }
1641         *result = retval;
1642         return 0;
1643 }
1644
1645 /*========================================================================
1646  * Utility routines.
1647  *========================================================================*/
1648
1649 /*
1650  * Compare two intermediate nodes for "order".
1651  */
1652 STATIC int
1653 xfs_da3_node_order(
1654         struct xfs_inode *dp,
1655         struct xfs_buf  *node1_bp,
1656         struct xfs_buf  *node2_bp)
1657 {
1658         struct xfs_da_intnode   *node1;
1659         struct xfs_da_intnode   *node2;
1660         struct xfs_da_node_entry *btree1;
1661         struct xfs_da_node_entry *btree2;
1662         struct xfs_da3_icnode_hdr node1hdr;
1663         struct xfs_da3_icnode_hdr node2hdr;
1664
1665         node1 = node1_bp->b_addr;
1666         node2 = node2_bp->b_addr;
1667         dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
1668         dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
1669         btree1 = dp->d_ops->node_tree_p(node1);
1670         btree2 = dp->d_ops->node_tree_p(node2);
1671
1672         if (node1hdr.count > 0 && node2hdr.count > 0 &&
1673             ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
1674              (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
1675               be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
1676                 return 1;
1677         }
1678         return 0;
1679 }
1680
1681 /*
1682  * Link a new block into a doubly linked list of blocks (of whatever type).
1683  */
1684 int                                                     /* error */
1685 xfs_da3_blk_link(
1686         struct xfs_da_state     *state,
1687         struct xfs_da_state_blk *old_blk,
1688         struct xfs_da_state_blk *new_blk)
1689 {
1690         struct xfs_da_blkinfo   *old_info;
1691         struct xfs_da_blkinfo   *new_info;
1692         struct xfs_da_blkinfo   *tmp_info;
1693         struct xfs_da_args      *args;
1694         struct xfs_buf          *bp;
1695         int                     before = 0;
1696         int                     error;
1697         struct xfs_inode        *dp = state->args->dp;
1698
1699         /*
1700          * Set up environment.
1701          */
1702         args = state->args;
1703         ASSERT(args != NULL);
1704         old_info = old_blk->bp->b_addr;
1705         new_info = new_blk->bp->b_addr;
1706         ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1707                old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1708                old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1709
1710         switch (old_blk->magic) {
1711         case XFS_ATTR_LEAF_MAGIC:
1712                 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1713                 break;
1714         case XFS_DIR2_LEAFN_MAGIC:
1715                 before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
1716                 break;
1717         case XFS_DA_NODE_MAGIC:
1718                 before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
1719                 break;
1720         }
1721
1722         /*
1723          * Link blocks in appropriate order.
1724          */
1725         if (before) {
1726                 /*
1727                  * Link new block in before existing block.
1728                  */
1729                 trace_xfs_da_link_before(args);
1730                 new_info->forw = cpu_to_be32(old_blk->blkno);
1731                 new_info->back = old_info->back;
1732                 if (old_info->back) {
1733                         error = xfs_da3_node_read(args->trans, dp,
1734                                                 be32_to_cpu(old_info->back),
1735                                                 -1, &bp, args->whichfork);
1736                         if (error)
1737                                 return error;
1738                         ASSERT(bp != NULL);
1739                         tmp_info = bp->b_addr;
1740                         ASSERT(tmp_info->magic == old_info->magic);
1741                         ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
1742                         tmp_info->forw = cpu_to_be32(new_blk->blkno);
1743                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1744                 }
1745                 old_info->back = cpu_to_be32(new_blk->blkno);
1746         } else {
1747                 /*
1748                  * Link new block in after existing block.
1749                  */
1750                 trace_xfs_da_link_after(args);
1751                 new_info->forw = old_info->forw;
1752                 new_info->back = cpu_to_be32(old_blk->blkno);
1753                 if (old_info->forw) {
1754                         error = xfs_da3_node_read(args->trans, dp,
1755                                                 be32_to_cpu(old_info->forw),
1756                                                 -1, &bp, args->whichfork);
1757                         if (error)
1758                                 return error;
1759                         ASSERT(bp != NULL);
1760                         tmp_info = bp->b_addr;
1761                         ASSERT(tmp_info->magic == old_info->magic);
1762                         ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
1763                         tmp_info->back = cpu_to_be32(new_blk->blkno);
1764                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1765                 }
1766                 old_info->forw = cpu_to_be32(new_blk->blkno);
1767         }
1768
1769         xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1770         xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1771         return 0;
1772 }
1773
1774 /*
1775  * Unlink a block from a doubly linked list of blocks.
1776  */
1777 STATIC int                                              /* error */
1778 xfs_da3_blk_unlink(
1779         struct xfs_da_state     *state,
1780         struct xfs_da_state_blk *drop_blk,
1781         struct xfs_da_state_blk *save_blk)
1782 {
1783         struct xfs_da_blkinfo   *drop_info;
1784         struct xfs_da_blkinfo   *save_info;
1785         struct xfs_da_blkinfo   *tmp_info;
1786         struct xfs_da_args      *args;
1787         struct xfs_buf          *bp;
1788         int                     error;
1789
1790         /*
1791          * Set up environment.
1792          */
1793         args = state->args;
1794         ASSERT(args != NULL);
1795         save_info = save_blk->bp->b_addr;
1796         drop_info = drop_blk->bp->b_addr;
1797         ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1798                save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1799                save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1800         ASSERT(save_blk->magic == drop_blk->magic);
1801         ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
1802                (be32_to_cpu(save_info->back) == drop_blk->blkno));
1803         ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
1804                (be32_to_cpu(drop_info->back) == save_blk->blkno));
1805
1806         /*
1807          * Unlink the leaf block from the doubly linked chain of leaves.
1808          */
1809         if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1810                 trace_xfs_da_unlink_back(args);
1811                 save_info->back = drop_info->back;
1812                 if (drop_info->back) {
1813                         error = xfs_da3_node_read(args->trans, args->dp,
1814                                                 be32_to_cpu(drop_info->back),
1815                                                 -1, &bp, args->whichfork);
1816                         if (error)
1817                                 return error;
1818                         ASSERT(bp != NULL);
1819                         tmp_info = bp->b_addr;
1820                         ASSERT(tmp_info->magic == save_info->magic);
1821                         ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
1822                         tmp_info->forw = cpu_to_be32(save_blk->blkno);
1823                         xfs_trans_log_buf(args->trans, bp, 0,
1824                                                     sizeof(*tmp_info) - 1);
1825                 }
1826         } else {
1827                 trace_xfs_da_unlink_forward(args);
1828                 save_info->forw = drop_info->forw;
1829                 if (drop_info->forw) {
1830                         error = xfs_da3_node_read(args->trans, args->dp,
1831                                                 be32_to_cpu(drop_info->forw),
1832                                                 -1, &bp, args->whichfork);
1833                         if (error)
1834                                 return error;
1835                         ASSERT(bp != NULL);
1836                         tmp_info = bp->b_addr;
1837                         ASSERT(tmp_info->magic == save_info->magic);
1838                         ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
1839                         tmp_info->back = cpu_to_be32(save_blk->blkno);
1840                         xfs_trans_log_buf(args->trans, bp, 0,
1841                                                     sizeof(*tmp_info) - 1);
1842                 }
1843         }
1844
1845         xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1846         return 0;
1847 }
1848
1849 /*
1850  * Move a path "forward" or "!forward" one block at the current level.
1851  *
1852  * This routine will adjust a "path" to point to the next block
1853  * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1854  * Btree, including updating pointers to the intermediate nodes between
1855  * the new bottom and the root.
1856  */
1857 int                                                     /* error */
1858 xfs_da3_path_shift(
1859         struct xfs_da_state     *state,
1860         struct xfs_da_state_path *path,
1861         int                     forward,
1862         int                     release,
1863         int                     *result)
1864 {
1865         struct xfs_da_state_blk *blk;
1866         struct xfs_da_blkinfo   *info;
1867         struct xfs_da_intnode   *node;
1868         struct xfs_da_args      *args;
1869         struct xfs_da_node_entry *btree;
1870         struct xfs_da3_icnode_hdr nodehdr;
1871         struct xfs_buf          *bp;
1872         xfs_dablk_t             blkno = 0;
1873         int                     level;
1874         int                     error;
1875         struct xfs_inode        *dp = state->args->dp;
1876
1877         trace_xfs_da_path_shift(state->args);
1878
1879         /*
1880          * Roll up the Btree looking for the first block where our
1881          * current index is not at the edge of the block.  Note that
1882          * we skip the bottom layer because we want the sibling block.
1883          */
1884         args = state->args;
1885         ASSERT(args != NULL);
1886         ASSERT(path != NULL);
1887         ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1888         level = (path->active-1) - 1;   /* skip bottom layer in path */
1889         for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1890                 node = blk->bp->b_addr;
1891                 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1892                 btree = dp->d_ops->node_tree_p(node);
1893
1894                 if (forward && (blk->index < nodehdr.count - 1)) {
1895                         blk->index++;
1896                         blkno = be32_to_cpu(btree[blk->index].before);
1897                         break;
1898                 } else if (!forward && (blk->index > 0)) {
1899                         blk->index--;
1900                         blkno = be32_to_cpu(btree[blk->index].before);
1901                         break;
1902                 }
1903         }
1904         if (level < 0) {
1905                 *result = -ENOENT;      /* we're out of our tree */
1906                 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1907                 return 0;
1908         }
1909
1910         /*
1911          * Roll down the edge of the subtree until we reach the
1912          * same depth we were at originally.
1913          */
1914         for (blk++, level++; level < path->active; blk++, level++) {
1915                 /*
1916                  * Read the next child block into a local buffer.
1917                  */
1918                 error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
1919                                           args->whichfork);
1920                 if (error)
1921                         return error;
1922
1923                 /*
1924                  * Release the old block (if it's dirty, the trans doesn't
1925                  * actually let go) and swap the local buffer into the path
1926                  * structure. This ensures failure of the above read doesn't set
1927                  * a NULL buffer in an active slot in the path.
1928                  */
1929                 if (release)
1930                         xfs_trans_brelse(args->trans, blk->bp);
1931                 blk->blkno = blkno;
1932                 blk->bp = bp;
1933
1934                 info = blk->bp->b_addr;
1935                 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1936                        info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
1937                        info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1938                        info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
1939                        info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
1940                        info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
1941
1942
1943                 /*
1944                  * Note: we flatten the magic number to a single type so we
1945                  * don't have to compare against crc/non-crc types elsewhere.
1946                  */
1947                 switch (be16_to_cpu(info->magic)) {
1948                 case XFS_DA_NODE_MAGIC:
1949                 case XFS_DA3_NODE_MAGIC:
1950                         blk->magic = XFS_DA_NODE_MAGIC;
1951                         node = (xfs_da_intnode_t *)info;
1952                         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1953                         btree = dp->d_ops->node_tree_p(node);
1954                         blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1955                         if (forward)
1956                                 blk->index = 0;
1957                         else
1958                                 blk->index = nodehdr.count - 1;
1959                         blkno = be32_to_cpu(btree[blk->index].before);
1960                         break;
1961                 case XFS_ATTR_LEAF_MAGIC:
1962                 case XFS_ATTR3_LEAF_MAGIC:
1963                         blk->magic = XFS_ATTR_LEAF_MAGIC;
1964                         ASSERT(level == path->active-1);
1965                         blk->index = 0;
1966                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1967                         break;
1968                 case XFS_DIR2_LEAFN_MAGIC:
1969                 case XFS_DIR3_LEAFN_MAGIC:
1970                         blk->magic = XFS_DIR2_LEAFN_MAGIC;
1971                         ASSERT(level == path->active-1);
1972                         blk->index = 0;
1973                         blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
1974                                                               blk->bp, NULL);
1975                         break;
1976                 default:
1977                         ASSERT(0);
1978                         break;
1979                 }
1980         }
1981         *result = 0;
1982         return 0;
1983 }
1984
1985
1986 /*========================================================================
1987  * Utility routines.
1988  *========================================================================*/
1989
1990 /*
1991  * Implement a simple hash on a character string.
1992  * Rotate the hash value by 7 bits, then XOR each character in.
1993  * This is implemented with some source-level loop unrolling.
1994  */
1995 xfs_dahash_t
1996 xfs_da_hashname(const uint8_t *name, int namelen)
1997 {
1998         xfs_dahash_t hash;
1999
2000         /*
2001          * Do four characters at a time as long as we can.
2002          */
2003         for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
2004                 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
2005                        (name[3] << 0) ^ rol32(hash, 7 * 4);
2006
2007         /*
2008          * Now do the rest of the characters.
2009          */
2010         switch (namelen) {
2011         case 3:
2012                 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
2013                        rol32(hash, 7 * 3);
2014         case 2:
2015                 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
2016         case 1:
2017                 return (name[0] << 0) ^ rol32(hash, 7 * 1);
2018         default: /* case 0: */
2019                 return hash;
2020         }
2021 }
2022
2023 enum xfs_dacmp
2024 xfs_da_compname(
2025         struct xfs_da_args *args,
2026         const unsigned char *name,
2027         int             len)
2028 {
2029         return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
2030                                         XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
2031 }
2032
2033 static xfs_dahash_t
2034 xfs_default_hashname(
2035         struct xfs_name *name)
2036 {
2037         return xfs_da_hashname(name->name, name->len);
2038 }
2039
2040 const struct xfs_nameops xfs_default_nameops = {
2041         .hashname       = xfs_default_hashname,
2042         .compname       = xfs_da_compname
2043 };
2044
2045 int
2046 xfs_da_grow_inode_int(
2047         struct xfs_da_args      *args,
2048         xfs_fileoff_t           *bno,
2049         int                     count)
2050 {
2051         struct xfs_trans        *tp = args->trans;
2052         struct xfs_inode        *dp = args->dp;
2053         int                     w = args->whichfork;
2054         xfs_rfsblock_t          nblks = dp->i_d.di_nblocks;
2055         struct xfs_bmbt_irec    map, *mapp;
2056         int                     nmap, error, got, i, mapi;
2057
2058         /*
2059          * Find a spot in the file space to put the new block.
2060          */
2061         error = xfs_bmap_first_unused(tp, dp, count, bno, w);
2062         if (error)
2063                 return error;
2064
2065         /*
2066          * Try mapping it in one filesystem block.
2067          */
2068         nmap = 1;
2069         ASSERT(args->firstblock != NULL);
2070         error = xfs_bmapi_write(tp, dp, *bno, count,
2071                         xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
2072                         args->firstblock, args->total, &map, &nmap,
2073                         args->dfops);
2074         if (error)
2075                 return error;
2076
2077         ASSERT(nmap <= 1);
2078         if (nmap == 1) {
2079                 mapp = &map;
2080                 mapi = 1;
2081         } else if (nmap == 0 && count > 1) {
2082                 xfs_fileoff_t           b;
2083                 int                     c;
2084
2085                 /*
2086                  * If we didn't get it and the block might work if fragmented,
2087                  * try without the CONTIG flag.  Loop until we get it all.
2088                  */
2089                 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
2090                 for (b = *bno, mapi = 0; b < *bno + count; ) {
2091                         nmap = MIN(XFS_BMAP_MAX_NMAP, count);
2092                         c = (int)(*bno + count - b);
2093                         error = xfs_bmapi_write(tp, dp, b, c,
2094                                         xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2095                                         args->firstblock, args->total,
2096                                         &mapp[mapi], &nmap, args->dfops);
2097                         if (error)
2098                                 goto out_free_map;
2099                         if (nmap < 1)
2100                                 break;
2101                         mapi += nmap;
2102                         b = mapp[mapi - 1].br_startoff +
2103                             mapp[mapi - 1].br_blockcount;
2104                 }
2105         } else {
2106                 mapi = 0;
2107                 mapp = NULL;
2108         }
2109
2110         /*
2111          * Count the blocks we got, make sure it matches the total.
2112          */
2113         for (i = 0, got = 0; i < mapi; i++)
2114                 got += mapp[i].br_blockcount;
2115         if (got != count || mapp[0].br_startoff != *bno ||
2116             mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
2117             *bno + count) {
2118                 error = -ENOSPC;
2119                 goto out_free_map;
2120         }
2121
2122         /* account for newly allocated blocks in reserved blocks total */
2123         args->total -= dp->i_d.di_nblocks - nblks;
2124
2125 out_free_map:
2126         if (mapp != &map)
2127                 kmem_free(mapp);
2128         return error;
2129 }
2130
2131 /*
2132  * Add a block to the btree ahead of the file.
2133  * Return the new block number to the caller.
2134  */
2135 int
2136 xfs_da_grow_inode(
2137         struct xfs_da_args      *args,
2138         xfs_dablk_t             *new_blkno)
2139 {
2140         xfs_fileoff_t           bno;
2141         int                     error;
2142
2143         trace_xfs_da_grow_inode(args);
2144
2145         bno = args->geo->leafblk;
2146         error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
2147         if (!error)
2148                 *new_blkno = (xfs_dablk_t)bno;
2149         return error;
2150 }
2151
2152 /*
2153  * Ick.  We need to always be able to remove a btree block, even
2154  * if there's no space reservation because the filesystem is full.
2155  * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
2156  * It swaps the target block with the last block in the file.  The
2157  * last block in the file can always be removed since it can't cause
2158  * a bmap btree split to do that.
2159  */
2160 STATIC int
2161 xfs_da3_swap_lastblock(
2162         struct xfs_da_args      *args,
2163         xfs_dablk_t             *dead_blknop,
2164         struct xfs_buf          **dead_bufp)
2165 {
2166         struct xfs_da_blkinfo   *dead_info;
2167         struct xfs_da_blkinfo   *sib_info;
2168         struct xfs_da_intnode   *par_node;
2169         struct xfs_da_intnode   *dead_node;
2170         struct xfs_dir2_leaf    *dead_leaf2;
2171         struct xfs_da_node_entry *btree;
2172         struct xfs_da3_icnode_hdr par_hdr;
2173         struct xfs_inode        *dp;
2174         struct xfs_trans        *tp;
2175         struct xfs_mount        *mp;
2176         struct xfs_buf          *dead_buf;
2177         struct xfs_buf          *last_buf;
2178         struct xfs_buf          *sib_buf;
2179         struct xfs_buf          *par_buf;
2180         xfs_dahash_t            dead_hash;
2181         xfs_fileoff_t           lastoff;
2182         xfs_dablk_t             dead_blkno;
2183         xfs_dablk_t             last_blkno;
2184         xfs_dablk_t             sib_blkno;
2185         xfs_dablk_t             par_blkno;
2186         int                     error;
2187         int                     w;
2188         int                     entno;
2189         int                     level;
2190         int                     dead_level;
2191
2192         trace_xfs_da_swap_lastblock(args);
2193
2194         dead_buf = *dead_bufp;
2195         dead_blkno = *dead_blknop;
2196         tp = args->trans;
2197         dp = args->dp;
2198         w = args->whichfork;
2199         ASSERT(w == XFS_DATA_FORK);
2200         mp = dp->i_mount;
2201         lastoff = args->geo->freeblk;
2202         error = xfs_bmap_last_before(tp, dp, &lastoff, w);
2203         if (error)
2204                 return error;
2205         if (unlikely(lastoff == 0)) {
2206                 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
2207                                  mp);
2208                 return -EFSCORRUPTED;
2209         }
2210         /*
2211          * Read the last block in the btree space.
2212          */
2213         last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
2214         error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
2215         if (error)
2216                 return error;
2217         /*
2218          * Copy the last block into the dead buffer and log it.
2219          */
2220         memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
2221         xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
2222         dead_info = dead_buf->b_addr;
2223         /*
2224          * Get values from the moved block.
2225          */
2226         if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
2227             dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
2228                 struct xfs_dir3_icleaf_hdr leafhdr;
2229                 struct xfs_dir2_leaf_entry *ents;
2230
2231                 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
2232                 dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
2233                 ents = dp->d_ops->leaf_ents_p(dead_leaf2);
2234                 dead_level = 0;
2235                 dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
2236         } else {
2237                 struct xfs_da3_icnode_hdr deadhdr;
2238
2239                 dead_node = (xfs_da_intnode_t *)dead_info;
2240                 dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
2241                 btree = dp->d_ops->node_tree_p(dead_node);
2242                 dead_level = deadhdr.level;
2243                 dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
2244         }
2245         sib_buf = par_buf = NULL;
2246         /*
2247          * If the moved block has a left sibling, fix up the pointers.
2248          */
2249         if ((sib_blkno = be32_to_cpu(dead_info->back))) {
2250                 error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
2251                 if (error)
2252                         goto done;
2253                 sib_info = sib_buf->b_addr;
2254                 if (unlikely(
2255                     be32_to_cpu(sib_info->forw) != last_blkno ||
2256                     sib_info->magic != dead_info->magic)) {
2257                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
2258                                          XFS_ERRLEVEL_LOW, mp);
2259                         error = -EFSCORRUPTED;
2260                         goto done;
2261                 }
2262                 sib_info->forw = cpu_to_be32(dead_blkno);
2263                 xfs_trans_log_buf(tp, sib_buf,
2264                         XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
2265                                         sizeof(sib_info->forw)));
2266                 sib_buf = NULL;
2267         }
2268         /*
2269          * If the moved block has a right sibling, fix up the pointers.
2270          */
2271         if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
2272                 error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
2273                 if (error)
2274                         goto done;
2275                 sib_info = sib_buf->b_addr;
2276                 if (unlikely(
2277                        be32_to_cpu(sib_info->back) != last_blkno ||
2278                        sib_info->magic != dead_info->magic)) {
2279                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
2280                                          XFS_ERRLEVEL_LOW, mp);
2281                         error = -EFSCORRUPTED;
2282                         goto done;
2283                 }
2284                 sib_info->back = cpu_to_be32(dead_blkno);
2285                 xfs_trans_log_buf(tp, sib_buf,
2286                         XFS_DA_LOGRANGE(sib_info, &sib_info->back,
2287                                         sizeof(sib_info->back)));
2288                 sib_buf = NULL;
2289         }
2290         par_blkno = args->geo->leafblk;
2291         level = -1;
2292         /*
2293          * Walk down the tree looking for the parent of the moved block.
2294          */
2295         for (;;) {
2296                 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
2297                 if (error)
2298                         goto done;
2299                 par_node = par_buf->b_addr;
2300                 dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
2301                 if (level >= 0 && level != par_hdr.level + 1) {
2302                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
2303                                          XFS_ERRLEVEL_LOW, mp);
2304                         error = -EFSCORRUPTED;
2305                         goto done;
2306                 }
2307                 level = par_hdr.level;
2308                 btree = dp->d_ops->node_tree_p(par_node);
2309                 for (entno = 0;
2310                      entno < par_hdr.count &&
2311                      be32_to_cpu(btree[entno].hashval) < dead_hash;
2312                      entno++)
2313                         continue;
2314                 if (entno == par_hdr.count) {
2315                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
2316                                          XFS_ERRLEVEL_LOW, mp);
2317                         error = -EFSCORRUPTED;
2318                         goto done;
2319                 }
2320                 par_blkno = be32_to_cpu(btree[entno].before);
2321                 if (level == dead_level + 1)
2322                         break;
2323                 xfs_trans_brelse(tp, par_buf);
2324                 par_buf = NULL;
2325         }
2326         /*
2327          * We're in the right parent block.
2328          * Look for the right entry.
2329          */
2330         for (;;) {
2331                 for (;
2332                      entno < par_hdr.count &&
2333                      be32_to_cpu(btree[entno].before) != last_blkno;
2334                      entno++)
2335                         continue;
2336                 if (entno < par_hdr.count)
2337                         break;
2338                 par_blkno = par_hdr.forw;
2339                 xfs_trans_brelse(tp, par_buf);
2340                 par_buf = NULL;
2341                 if (unlikely(par_blkno == 0)) {
2342                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
2343                                          XFS_ERRLEVEL_LOW, mp);
2344                         error = -EFSCORRUPTED;
2345                         goto done;
2346                 }
2347                 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
2348                 if (error)
2349                         goto done;
2350                 par_node = par_buf->b_addr;
2351                 dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
2352                 if (par_hdr.level != level) {
2353                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
2354                                          XFS_ERRLEVEL_LOW, mp);
2355                         error = -EFSCORRUPTED;
2356                         goto done;
2357                 }
2358                 btree = dp->d_ops->node_tree_p(par_node);
2359                 entno = 0;
2360         }
2361         /*
2362          * Update the parent entry pointing to the moved block.
2363          */
2364         btree[entno].before = cpu_to_be32(dead_blkno);
2365         xfs_trans_log_buf(tp, par_buf,
2366                 XFS_DA_LOGRANGE(par_node, &btree[entno].before,
2367                                 sizeof(btree[entno].before)));
2368         *dead_blknop = last_blkno;
2369         *dead_bufp = last_buf;
2370         return 0;
2371 done:
2372         if (par_buf)
2373                 xfs_trans_brelse(tp, par_buf);
2374         if (sib_buf)
2375                 xfs_trans_brelse(tp, sib_buf);
2376         xfs_trans_brelse(tp, last_buf);
2377         return error;
2378 }
2379
2380 /*
2381  * Remove a btree block from a directory or attribute.
2382  */
2383 int
2384 xfs_da_shrink_inode(
2385         xfs_da_args_t   *args,
2386         xfs_dablk_t     dead_blkno,
2387         struct xfs_buf  *dead_buf)
2388 {
2389         xfs_inode_t *dp;
2390         int done, error, w, count;
2391         xfs_trans_t *tp;
2392
2393         trace_xfs_da_shrink_inode(args);
2394
2395         dp = args->dp;
2396         w = args->whichfork;
2397         tp = args->trans;
2398         count = args->geo->fsbcount;
2399         for (;;) {
2400                 /*
2401                  * Remove extents.  If we get ENOSPC for a dir we have to move
2402                  * the last block to the place we want to kill.
2403                  */
2404                 error = xfs_bunmapi(tp, dp, dead_blkno, count,
2405                                     xfs_bmapi_aflag(w), 0, args->firstblock,
2406                                     args->dfops, &done);
2407                 if (error == -ENOSPC) {
2408                         if (w != XFS_DATA_FORK)
2409                                 break;
2410                         error = xfs_da3_swap_lastblock(args, &dead_blkno,
2411                                                       &dead_buf);
2412                         if (error)
2413                                 break;
2414                 } else {
2415                         break;
2416                 }
2417         }
2418         xfs_trans_binval(tp, dead_buf);
2419         return error;
2420 }
2421
2422 /*
2423  * See if the mapping(s) for this btree block are valid, i.e.
2424  * don't contain holes, are logically contiguous, and cover the whole range.
2425  */
2426 STATIC int
2427 xfs_da_map_covers_blocks(
2428         int             nmap,
2429         xfs_bmbt_irec_t *mapp,
2430         xfs_dablk_t     bno,
2431         int             count)
2432 {
2433         int             i;
2434         xfs_fileoff_t   off;
2435
2436         for (i = 0, off = bno; i < nmap; i++) {
2437                 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
2438                     mapp[i].br_startblock == DELAYSTARTBLOCK) {
2439                         return 0;
2440                 }
2441                 if (off != mapp[i].br_startoff) {
2442                         return 0;
2443                 }
2444                 off += mapp[i].br_blockcount;
2445         }
2446         return off == bno + count;
2447 }
2448
2449 /*
2450  * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
2451  *
2452  * For the single map case, it is assumed that the caller has provided a pointer
2453  * to a valid xfs_buf_map.  For the multiple map case, this function will
2454  * allocate the xfs_buf_map to hold all the maps and replace the caller's single
2455  * map pointer with the allocated map.
2456  */
2457 static int
2458 xfs_buf_map_from_irec(
2459         struct xfs_mount        *mp,
2460         struct xfs_buf_map      **mapp,
2461         int                     *nmaps,
2462         struct xfs_bmbt_irec    *irecs,
2463         int                     nirecs)
2464 {
2465         struct xfs_buf_map      *map;
2466         int                     i;
2467
2468         ASSERT(*nmaps == 1);
2469         ASSERT(nirecs >= 1);
2470
2471         if (nirecs > 1) {
2472                 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2473                                   KM_SLEEP | KM_NOFS);
2474                 if (!map)
2475                         return -ENOMEM;
2476                 *mapp = map;
2477         }
2478
2479         *nmaps = nirecs;
2480         map = *mapp;
2481         for (i = 0; i < *nmaps; i++) {
2482                 ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
2483                        irecs[i].br_startblock != HOLESTARTBLOCK);
2484                 map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
2485                 map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
2486         }
2487         return 0;
2488 }
2489
2490 /*
2491  * Map the block we are given ready for reading. There are three possible return
2492  * values:
2493  *      -1 - will be returned if we land in a hole and mappedbno == -2 so the
2494  *           caller knows not to execute a subsequent read.
2495  *       0 - if we mapped the block successfully
2496  *      >0 - positive error number if there was an error.
2497  */
2498 static int
2499 xfs_dabuf_map(
2500         struct xfs_inode        *dp,
2501         xfs_dablk_t             bno,
2502         xfs_daddr_t             mappedbno,
2503         int                     whichfork,
2504         struct xfs_buf_map      **map,
2505         int                     *nmaps)
2506 {
2507         struct xfs_mount        *mp = dp->i_mount;
2508         int                     nfsb;
2509         int                     error = 0;
2510         struct xfs_bmbt_irec    irec;
2511         struct xfs_bmbt_irec    *irecs = &irec;
2512         int                     nirecs;
2513
2514         ASSERT(map && *map);
2515         ASSERT(*nmaps == 1);
2516
2517         if (whichfork == XFS_DATA_FORK)
2518                 nfsb = mp->m_dir_geo->fsbcount;
2519         else
2520                 nfsb = mp->m_attr_geo->fsbcount;
2521
2522         /*
2523          * Caller doesn't have a mapping.  -2 means don't complain
2524          * if we land in a hole.
2525          */
2526         if (mappedbno == -1 || mappedbno == -2) {
2527                 /*
2528                  * Optimize the one-block case.
2529                  */
2530                 if (nfsb != 1)
2531                         irecs = kmem_zalloc(sizeof(irec) * nfsb,
2532                                             KM_SLEEP | KM_NOFS);
2533
2534                 nirecs = nfsb;
2535                 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
2536                                        &nirecs, xfs_bmapi_aflag(whichfork));
2537                 if (error)
2538                         goto out;
2539         } else {
2540                 irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2541                 irecs->br_startoff = (xfs_fileoff_t)bno;
2542                 irecs->br_blockcount = nfsb;
2543                 irecs->br_state = 0;
2544                 nirecs = 1;
2545         }
2546
2547         if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2548                 error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
2549                 if (unlikely(error == -EFSCORRUPTED)) {
2550                         if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2551                                 int i;
2552                                 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
2553                                         __func__, (long long)bno,
2554                                         (long long)dp->i_ino);
2555                                 for (i = 0; i < *nmaps; i++) {
2556                                         xfs_alert(mp,
2557 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2558                                                 i,
2559                                                 (long long)irecs[i].br_startoff,
2560                                                 (long long)irecs[i].br_startblock,
2561                                                 (long long)irecs[i].br_blockcount,
2562                                                 irecs[i].br_state);
2563                                 }
2564                         }
2565                         XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2566                                          XFS_ERRLEVEL_LOW, mp);
2567                 }
2568                 goto out;
2569         }
2570         error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
2571 out:
2572         if (irecs != &irec)
2573                 kmem_free(irecs);
2574         return error;
2575 }
2576
2577 /*
2578  * Get a buffer for the dir/attr block.
2579  */
2580 int
2581 xfs_da_get_buf(
2582         struct xfs_trans        *trans,
2583         struct xfs_inode        *dp,
2584         xfs_dablk_t             bno,
2585         xfs_daddr_t             mappedbno,
2586         struct xfs_buf          **bpp,
2587         int                     whichfork)
2588 {
2589         struct xfs_buf          *bp;
2590         struct xfs_buf_map      map;
2591         struct xfs_buf_map      *mapp;
2592         int                     nmap;
2593         int                     error;
2594
2595         *bpp = NULL;
2596         mapp = &map;
2597         nmap = 1;
2598         error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2599                                 &mapp, &nmap);
2600         if (error) {
2601                 /* mapping a hole is not an error, but we don't continue */
2602                 if (error == -1)
2603                         error = 0;
2604                 goto out_free;
2605         }
2606
2607         bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2608                                     mapp, nmap, 0);
2609         error = bp ? bp->b_error : -EIO;
2610         if (error) {
2611                 if (bp)
2612                         xfs_trans_brelse(trans, bp);
2613                 goto out_free;
2614         }
2615
2616         *bpp = bp;
2617
2618 out_free:
2619         if (mapp != &map)
2620                 kmem_free(mapp);
2621
2622         return error;
2623 }
2624
2625 /*
2626  * Get a buffer for the dir/attr block, fill in the contents.
2627  */
2628 int
2629 xfs_da_read_buf(
2630         struct xfs_trans        *trans,
2631         struct xfs_inode        *dp,
2632         xfs_dablk_t             bno,
2633         xfs_daddr_t             mappedbno,
2634         struct xfs_buf          **bpp,
2635         int                     whichfork,
2636         const struct xfs_buf_ops *ops)
2637 {
2638         struct xfs_buf          *bp;
2639         struct xfs_buf_map      map;
2640         struct xfs_buf_map      *mapp;
2641         int                     nmap;
2642         int                     error;
2643
2644         *bpp = NULL;
2645         mapp = &map;
2646         nmap = 1;
2647         error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2648                                 &mapp, &nmap);
2649         if (error) {
2650                 /* mapping a hole is not an error, but we don't continue */
2651                 if (error == -1)
2652                         error = 0;
2653                 goto out_free;
2654         }
2655
2656         error = xfs_trans_read_buf_map(dp->i_mount, trans,
2657                                         dp->i_mount->m_ddev_targp,
2658                                         mapp, nmap, 0, &bp, ops);
2659         if (error)
2660                 goto out_free;
2661
2662         if (whichfork == XFS_ATTR_FORK)
2663                 xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
2664         else
2665                 xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
2666         *bpp = bp;
2667 out_free:
2668         if (mapp != &map)
2669                 kmem_free(mapp);
2670
2671         return error;
2672 }
2673
2674 /*
2675  * Readahead the dir/attr block.
2676  */
2677 int
2678 xfs_da_reada_buf(
2679         struct xfs_inode        *dp,
2680         xfs_dablk_t             bno,
2681         xfs_daddr_t             mappedbno,
2682         int                     whichfork,
2683         const struct xfs_buf_ops *ops)
2684 {
2685         struct xfs_buf_map      map;
2686         struct xfs_buf_map      *mapp;
2687         int                     nmap;
2688         int                     error;
2689
2690         mapp = &map;
2691         nmap = 1;
2692         error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2693                                 &mapp, &nmap);
2694         if (error) {
2695                 /* mapping a hole is not an error, but we don't continue */
2696                 if (error == -1)
2697                         error = 0;
2698                 goto out_free;
2699         }
2700
2701         mappedbno = mapp[0].bm_bn;
2702         xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2703
2704 out_free:
2705         if (mapp != &map)
2706                 kmem_free(mapp);
2707
2708         return error;
2709 }