libxfs/xfs_da_btree.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * Copyright (c) 2013 Red Hat, Inc.
   4  * All Rights Reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License as
   8  * published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope that it would be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13  * GNU General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program; if not, write the Free Software Foundation,
  17  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  18  */
  19 #include "libxfs_priv.h"
  20 #include "xfs_fs.h"
  21 #include "xfs_shared.h"
  22 #include "xfs_format.h"
  23 #include "xfs_log_format.h"
  24 #include "xfs_trans_resv.h"
  25 #include "xfs_bit.h"
  26 #include "xfs_mount.h"
  27 #include "xfs_da_format.h"
  28 #include "xfs_da_btree.h"
  29 #include "xfs_dir2.h"
  30 #include "xfs_dir2_priv.h"
  31 #include "xfs_inode.h"
  32 #include "xfs_trans.h"
  33 #include "xfs_alloc.h"
  34 #include "xfs_bmap.h"
  35 #include "xfs_attr_leaf.h"
  36 #include "xfs_trace.h"
  37 #include "xfs_cksum.h"
  38
  39 /*
  40  * xfs_da_btree.c
  41  *
  42  * Routines to implement directories as Btrees of hashed names.
  43  */
  44
  45 /*========================================================================
  46  * Function prototypes for the kernel.
      *
      * Forward declarations of the STATIC helpers, so that they can be called
      * ahead of their definitions.
  47  *========================================================================*/
  48
  49 /*
  50  * Routines used for growing the Btree.
      *
      * xfs_da3_root_split()     - create a new root above the split old root
      * xfs_da3_node_split()     - split an intermediate node when it has no
      *                            room, then add the new entry (or entries)
      * xfs_da3_node_rebalance() - balance the entries of two intermediate nodes
      * xfs_da3_node_add()       - insert an entry before the block's index
  51  */
  52 STATIC int xfs_da3_root_split(xfs_da_state_t *state,
  53                                             xfs_da_state_blk_t *existing_root,
  54                                             xfs_da_state_blk_t *new_child);
  55 STATIC int xfs_da3_node_split(xfs_da_state_t *state,
  56                                             xfs_da_state_blk_t *existing_blk,
  57                                             xfs_da_state_blk_t *split_blk,
  58                                             xfs_da_state_blk_t *blk_to_add,
  59                                             int treelevel,
  60                                             int *result);
  61 STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state,
  62                                          xfs_da_state_blk_t *node_blk_1,
  63                                          xfs_da_state_blk_t *node_blk_2);
  64 STATIC void xfs_da3_node_add(xfs_da_state_t *state,
  65                                    xfs_da_state_blk_t *old_node_blk,
  66                                    xfs_da_state_blk_t *new_node_blk);
  67
  68 /*
  69  * Routines used for shrinking the Btree.
  70  */
  71 STATIC int xfs_da3_root_join(xfs_da_state_t *state,
  72                                            xfs_da_state_blk_t *root_blk);
  73 STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval);
  74 STATIC void xfs_da3_node_remove(xfs_da_state_t *state,
  75                                               xfs_da_state_blk_t *drop_blk);
  76 STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state,
  77                                          xfs_da_state_blk_t *src_node_blk,
  78                                          xfs_da_state_blk_t *dst_node_blk);
  79
  80 /*
  81  * Utility routines.
  82  */
  83 STATIC int      xfs_da3_blk_unlink(xfs_da_state_t *state,
  84                                   xfs_da_state_blk_t *drop_blk,
  85                                   xfs_da_state_blk_t *save_blk);
  86
  87
     /*
      * Zone that xfs_da_state_alloc() allocates from and xfs_da_state_free()
      * returns structures to.
      */
  88 kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */
  89
  90 /*
  91  * Allocate a dir-state structure.
  92  * We don't put them on the stack since they're large.
      *
      * The structure is taken from xfs_da_state_zone with kmem_zone_zalloc()
      * and the KM_NOFS flag (presumably zero-filled, going by the "zalloc"
      * name - confirm against the kmem implementation).  The allocator's
      * result is returned unchanged; hand it back with xfs_da_state_free().
  93  */
  94 xfs_da_state_t *
  95 xfs_da_state_alloc(void)
  96 {
  97         return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
  98 }
  99
 100 /*
 101  * Kill the altpath contents of a da-state structure.
      *
      * Clears the buffer pointer of each of the first altpath.active entries
      * of state->altpath.blk[] and then marks the altpath as empty.  Only the
      * pointers are forgotten; nothing is released through this function.
 102  */
 103 STATIC void
 104 xfs_da_state_kill_altpath(xfs_da_state_t *state)
 105 {
 106         int     i;
 107
 108         for (i = 0; i < state->altpath.active; i++)
 109                 state->altpath.blk[i].bp = NULL;
 110         state->altpath.active = 0;
 111 }
 112
 113 /*
 114  * Free a da-state structure.
      *
      * The altpath buffer pointers are cleared first, then the structure is
      * handed back to xfs_da_state_zone.  DEBUG builds zero the whole
      * structure before freeing it (presumably so that a stale user trips
      * over zeroed state - confirm intent).
 115  */
 116 void
 117 xfs_da_state_free(xfs_da_state_t *state)
 118 {
 119         xfs_da_state_kill_altpath(state);
 120 #ifdef DEBUG
 121         memset((char *)state, 0, sizeof(*state));
 122 #endif /* DEBUG */
 123         kmem_zone_free(xfs_da_state_zone, state);
 124 }
  125
     /*
      * Structural check of a da btree node buffer, shared by the read and
      * write verifiers.
      *
      * The on-disk header is decoded into an in-core header and then checked:
      *  - CRC-enabled filesystems: v3 node magic, metadata UUID matches the
      *    superblock, stored block number matches bp->b_bn, and the stored LSN
      *    passes xfs_log_check_lsn();
      *  - otherwise: v2 node magic;
      *  - level is non-zero and no larger than XFS_DA_NODE_MAXDEPTH;
      *  - count is non-zero and does not exceed both the directory and the
      *    attribute node_ents limits.
      *
      * Returns NULL when every check passes, otherwise __this_address
      * (presumably identifying the check that failed - see its definition).
      */
 126 static xfs_failaddr_t
 127 xfs_da3_node_verify(
 128         struct xfs_buf          *bp)
 129 {
 130         struct xfs_mount        *mp = bp->b_target->bt_mount;
 131         struct xfs_da_intnode   *hdr = bp->b_addr;
 132         struct xfs_da3_icnode_hdr ichdr;
 133         const struct xfs_dir_ops *ops;
 134
             /* no inode is available here, so ask for the ops with NULL */
 135         ops = xfs_dir_get_ops(mp, NULL);
 136
 137         ops->node_hdr_from_disk(&ichdr, hdr);
 138
 139         if (xfs_sb_version_hascrc(&mp->m_sb)) {
 140                 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 141
 142                 if (ichdr.magic != XFS_DA3_NODE_MAGIC)
 143                         return __this_address;
 144
 145                 if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid))
 146                         return __this_address;
 147                 if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
 148                         return __this_address;
 149                 if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
 150                         return __this_address;
 151         } else {
 152                 if (ichdr.magic != XFS_DA_NODE_MAGIC)
 153                         return __this_address;
 154         }
 155         if (ichdr.level == 0)
 156                 return __this_address;
 157         if (ichdr.level > XFS_DA_NODE_MAXDEPTH)
 158                 return __this_address;
 159         if (ichdr.count == 0)
 160                 return __this_address;
 161
 162         /*
 163          * we don't know if the node is for an attribute or directory tree,
 164          * so only fail if the count is outside both bounds
 165          */
 166         if (ichdr.count > mp->m_dir_geo->node_ents &&
 167             ichdr.count > mp->m_attr_geo->node_ents)
 168                 return __this_address;
 169
 170         /* XXX: hash order check? */
 171
 172         return NULL;
 173 }
  174
     /*
      * Write verifier for da btree node buffers.
      *
      * A buffer that fails xfs_da3_node_verify() is reported through
      * xfs_verifier_error() with -EFSCORRUPTED and left otherwise untouched.
      * On filesystems without CRCs nothing more is done.  With CRCs, the LSN
      * of the attached buffer log item (if there is one) is stamped into the
      * v3 header and the checksum at XFS_DA3_NODE_CRC_OFF is recomputed.
      */
 175 static void
 176 xfs_da3_node_write_verify(
 177         struct xfs_buf  *bp)
 178 {
 179         struct xfs_mount        *mp = bp->b_target->bt_mount;
 180         struct xfs_buf_log_item *bip = bp->b_fspriv;
 181         struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 182
 183         if (xfs_da3_node_verify(bp)) {
 184                 xfs_verifier_error(bp, -EFSCORRUPTED);
 185                 return;
 186         }
 187
 188         if (!xfs_sb_version_hascrc(&mp->m_sb))
 189                 return;
 190
             /* bip is NULL when no log item is attached to the buffer */
 191         if (bip)
 192                 hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
 193
 194         xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
 195 }
 196
 197 /*
 198  * leaf/node format detection on trees is sketchy, so a node read can be done on
 199  * leaf level blocks when detection identifies the tree as a node format tree
 200  * incorrectly. In this case, we need to swap the verifier to match the correct
 201  * format of the block being read.
      *
      * Dispatch is on the magic number at the start of the block:
      *  - v3 node: verify the checksum first (-EFSBADCRC on mismatch), then
      *    fall through to the structural check;
      *  - v2 node: structural check only (-EFSCORRUPTED on failure);
      *  - attr leaf / dir leafn (v2 or v3): replace bp->b_ops with the matching
      *    leaf ops and run that read verifier instead;
      *  - anything else: -EFSCORRUPTED.
      * Failures are reported through xfs_verifier_error().
 202  */
 203 static void
 204 xfs_da3_node_read_verify(
 205         struct xfs_buf          *bp)
 206 {
 207         struct xfs_da_blkinfo   *info = bp->b_addr;
 208
 209         switch (be16_to_cpu(info->magic)) {
 210                 case XFS_DA3_NODE_MAGIC:
 211                         if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
 212                                 xfs_verifier_error(bp, -EFSBADCRC);
 213                                 break;
 214                         }
 215                         /* fall through */
 216                 case XFS_DA_NODE_MAGIC:
 217                         if (xfs_da3_node_verify(bp))
 218                                 xfs_verifier_error(bp, -EFSCORRUPTED);
 219                         return;
 220                 case XFS_ATTR_LEAF_MAGIC:
 221                 case XFS_ATTR3_LEAF_MAGIC:
                             /* really an attr leaf: swap ops, then verify as one */
 222                         bp->b_ops = &xfs_attr3_leaf_buf_ops;
 223                         bp->b_ops->verify_read(bp);
 224                         return;
 225                 case XFS_DIR2_LEAFN_MAGIC:
 226                 case XFS_DIR3_LEAFN_MAGIC:
                             /* really a dir leafn: swap ops, then verify as one */
 227                         bp->b_ops = &xfs_dir3_leafn_buf_ops;
 228                         bp->b_ops->verify_read(bp);
 229                         return;
 230                 default:
 231                         xfs_verifier_error(bp, -EFSCORRUPTED);
 232                         break;
 233         }
 234 }
  235
     /*
      * Buffer operations for da btree node blocks: a name plus the read and
      * write verifiers defined above.
      */
 236 const struct xfs_buf_ops xfs_da3_node_buf_ops = {
 237         .name = "xfs_da3_node",
 238         .verify_read = xfs_da3_node_read_verify,
 239         .verify_write = xfs_da3_node_write_verify,
 240 };
 241
 242 int
 243 xfs_da3_node_read(
 244         struct xfs_trans        *tp,
 245         struct xfs_inode        *dp,
 246         xfs_dablk_t             bno,
 247         xfs_daddr_t             mappedbno,
 248         struct xfs_buf          **bpp,
 249         int                     which_fork)
 250 {
 251         int                     err;
 252
 253         err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
 254                                         which_fork, &xfs_da3_node_buf_ops);
 255         if (!err && tp && *bpp) {
 256                 struct xfs_da_blkinfo   *info = (*bpp)->b_addr;
 257                 int                     type;
 258
 259                 switch (be16_to_cpu(info->magic)) {
 260                 case XFS_DA_NODE_MAGIC:
 261                 case XFS_DA3_NODE_MAGIC:
 262                         type = XFS_BLFT_DA_NODE_BUF;
 263                         break;
 264                 case XFS_ATTR_LEAF_MAGIC:
 265                 case XFS_ATTR3_LEAF_MAGIC:
 266                         type = XFS_BLFT_ATTR_LEAF_BUF;
 267                         break;
 268                 case XFS_DIR2_LEAFN_MAGIC:
 269                 case XFS_DIR3_LEAFN_MAGIC:
 270                         type = XFS_BLFT_DIR_LEAFN_BUF;
 271                         break;
 272                 default:
 273                         type = 0;
 274                         ASSERT(0);
 275                         break;
 276                 }
 277                 xfs_trans_buf_set_type(tp, *bpp, type);
 278         }
 279         return err;
 280 }
 281
 282 /*========================================================================
 283  * Routines used for growing the Btree.
 284  *========================================================================*/
 285
 286 /*
 287  * Create the initial contents of an intermediate node.
      *
      * args      - operation arguments; args->trans and args->dp are used
      * blkno     - da block number of the node to initialise
      * level     - tree level stored in the new header
      * bpp       - on success, receives the buffer holding the new node
      * whichfork - fork passed on to xfs_da_get_buf()
      *
      * The buffer is obtained with xfs_da_get_buf(), given the node buffer ops
      * and buffer type, and its header is written and logged.  The entry count
      * stays zero because the in-core header starts out zeroed.
      *
      * Returns 0, or the error from xfs_da_get_buf().
 288  */
 289 int
 290 xfs_da3_node_create(
 291         struct xfs_da_args      *args,
 292         xfs_dablk_t             blkno,
 293         int                     level,
 294         struct xfs_buf          **bpp,
 295         int                     whichfork)
 296 {
 297         struct xfs_da_intnode   *node;
 298         struct xfs_trans        *tp = args->trans;
 299         struct xfs_mount        *mp = tp->t_mountp;
 300         struct xfs_da3_icnode_hdr ichdr = {0};
 301         struct xfs_buf          *bp;
 302         int                     error;
 303         struct xfs_inode        *dp = args->dp;
 304
 305         trace_xfs_da_node_create(args);
 306         ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
 307
 308         error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
 309         if (error)
 310                 return error;
 311         bp->b_ops = &xfs_da3_node_buf_ops;
 312         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 313         node = bp->b_addr;
 314
 315         if (xfs_sb_version_hascrc(&mp->m_sb)) {
 316                 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
 317
                     /* v3 header: start from zero, then fill in the identity fields */
 318                 memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
 319                 ichdr.magic = XFS_DA3_NODE_MAGIC;
 320                 hdr3->info.blkno = cpu_to_be64(bp->b_bn);
 321                 hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
 322                 uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid);
 323         } else {
 324                 ichdr.magic = XFS_DA_NODE_MAGIC;
 325         }
 326         ichdr.level = level;
 327
             /* write the in-core header to the block and log just the header */
 328         dp->d_ops->node_hdr_to_disk(node, &ichdr);
 329         xfs_trans_log_buf(tp, bp,
 330                 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 331
 332         *bpp = bp;
 333         return 0;
 334 }
 335
 336 /*
 337  * Split a leaf node, rebalance, then possibly split
 338  * intermediate nodes, rebalance, etc.
      *
      * state->path describes the route from the root (blk[0]) down to the
      * leaf (blk[path.active - 1]) that is being split; state->altpath.blk[]
      * supplies the slot for the new sibling created at each level.
      *
      * The loop works upwards from the leaf for as long as the level below
      * produced a block (addblk) that still has to be inserted into its
      * parent.  If a block is still pending once the top of the path has been
      * processed, the root itself is split and the sibling pointers of the
      * block that used to be the root are fixed up.
      *
      * Returns 0 on success, or the error returned by the leaf, node or root
      * split helper that failed.
 339  */
 340 int                                                     /* error */
 341 xfs_da3_split(
 342         struct xfs_da_state     *state)
 343 {
 344         struct xfs_da_state_blk *oldblk;
 345         struct xfs_da_state_blk *newblk;
 346         struct xfs_da_state_blk *addblk;
 347         struct xfs_da_intnode   *node;
 348         int                     max;
 349         int                     action = 0;
 350         int                     error;
 351         int                     i;
 352
 353         trace_xfs_da_split(state->args);
 354
 355         /*
 356          * Walk back up the tree splitting/inserting/adjusting as necessary.
 357          * If we need to insert and there isn't room, split the node, then
 358          * decide which fragment to insert the new block from below into.
 359          * Note that we may split the root this way, but we need more fixup.
 360          */
 361         max = state->path.active - 1;
 362         ASSERT((max >= 0) && (max < XFS_DA_NODE_MAXDEPTH));
 363         ASSERT(state->path.blk[max].magic == XFS_ATTR_LEAF_MAGIC ||
 364                state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 365
 366         addblk = &state->path.blk[max];         /* initial dummy value */
             /* path.active shrinks by one for every level processed */
 367         for (i = max; (i >= 0) && addblk; state->path.active--, i--) {
 368                 oldblk = &state->path.blk[i];
 369                 newblk = &state->altpath.blk[i];
 370
 371                 /*
 372                  * If a leaf node then
 373                  *     Allocate a new leaf node, then rebalance across them.
 374                  * else if an intermediate node then
 375                  *     We split on the last layer, must we split the node?
 376                  */
 377                 switch (oldblk->magic) {
 378                 case XFS_ATTR_LEAF_MAGIC:
 379                         error = xfs_attr3_leaf_split(state, oldblk, newblk);
                             /* -ENOSPC is handled below by splitting a second time */
 380                         if ((error != 0) && (error != -ENOSPC)) {
 381                                 return error;   /* GROT: attr is inconsistent */
 382                         }
 383                         if (!error) {
 384                                 addblk = newblk;
 385                                 break;
 386                         }
 387                         /*
 388                          * Entry wouldn't fit, split the leaf again. The new
 389                          * extrablk will be consumed by xfs_da3_node_split if
 390                          * the node is split.
 391                          */
 392                         state->extravalid = 1;
 393                         if (state->inleaf) {
 394                                 state->extraafter = 0;  /* before newblk */
 395                                 trace_xfs_attr_leaf_split_before(state->args);
 396                                 error = xfs_attr3_leaf_split(state, oldblk,
 397                                                             &state->extrablk);
 398                         } else {
 399                                 state->extraafter = 1;  /* after newblk */
 400                                 trace_xfs_attr_leaf_split_after(state->args);
 401                                 error = xfs_attr3_leaf_split(state, newblk,
 402                                                             &state->extrablk);
 403                         }
 404                         if (error)
 405                                 return error;   /* GROT: attr inconsistent */
 406                         addblk = newblk;
 407                         break;
 408                 case XFS_DIR2_LEAFN_MAGIC:
 409                         error = xfs_dir2_leafn_split(state, oldblk, newblk);
 410                         if (error)
 411                                 return error;
 412                         addblk = newblk;
 413                         break;
 414                 case XFS_DA_NODE_MAGIC:
 415                         error = xfs_da3_node_split(state, oldblk, newblk, addblk,
 416                                                          max - i, &action);
                             /*
                              * The block from the level below has been handed to
                              * the node split; forget its buffer pointer whether
                              * or not the split succeeded.
                              */
 417                         addblk->bp = NULL;
 418                         if (error)
 419                                 return error;   /* GROT: dir is inconsistent */
 420                         /*
 421                          * Record the newly split block for the next time thru?
 422                          */
 423                         if (action)
 424                                 addblk = newblk;
 425                         else
 426                                 addblk = NULL;
 427                         break;
 428                 }
 429
 430                 /*
 431                  * Update the btree to show the new hashval for this child.
 432                  */
 433                 xfs_da3_fixhashpath(state, &state->path);
 434         }
             /* nothing left to insert: the split was absorbed below the root */
 435         if (!addblk)
 436                 return 0;
 437
 438         /*
 439          * xfs_da3_node_split() should have consumed any extra blocks we added
 440          * during a double leaf split in the attr fork. This is guaranteed as
 441          * we can't be here if the attr fork only has a single leaf block.
 442          */
 443         ASSERT(state->extravalid == 0 ||
 444                state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 445
 446         /*
 447          * Split the root node.
 448          */
 449         ASSERT(state->path.active == 0);
 450         oldblk = &state->path.blk[0];
 451         error = xfs_da3_root_split(state, oldblk, addblk);
 452         if (error) {
 453                 addblk->bp = NULL;
 454                 return error;   /* GROT: dir is inconsistent */
 455         }
 456
 457         /*
 458          * Update pointers to the node which used to be block 0 and just got
 459          * bumped because of the addition of a new root node.  Note that the
 460          * original block 0 could be at any position in the list of blocks in
 461          * the tree.
 462          *
 463          * Note: the magic numbers and sibling pointers are in the same physical
 464          * place for both v2 and v3 headers (by design). Hence it doesn't matter
 465          * which version of the xfs_da_intnode structure we use here as the
 466          * result will be the same using either structure.
 467          */
 468         node = oldblk->bp->b_addr;
 469         if (node->hdr.info.forw) {
 470                 ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
 471                 node = addblk->bp->b_addr;
 472                 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
 473                 xfs_trans_log_buf(state->args->trans, addblk->bp,
 474                                   XFS_DA_LOGRANGE(node, &node->hdr.info,
 475                                   sizeof(node->hdr.info)));
 476         }
             /* re-read the relocated old root; node was repointed at addblk above */
 477         node = oldblk->bp->b_addr;
 478         if (node->hdr.info.back) {
 479                 ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
 480                 node = addblk->bp->b_addr;
 481                 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
 482                 xfs_trans_log_buf(state->args->trans, addblk->bp,
 483                                   XFS_DA_LOGRANGE(node, &node->hdr.info,
 484                                   sizeof(node->hdr.info)));
 485         }
 486         addblk->bp = NULL;
 487         return 0;
 488 }
 489
 490 /*
 491  * Split the root.  We have to create a new root and point to the two
 492  * parts (the split old root) that we just created.  Copy block zero to
 493  * the EOF, extending the inode in process.
      *
      * state - split state; state->args supplies the inode, transaction, fork
      *         and geometry
      * blk1  - the old root; on return its bp and blkno refer to the newly
      *         allocated block that now holds the old root's contents
      * blk2  - the sibling created by the split below
      *
      * The new root is a node one level above the old root (level 1 when the
      * old root was a directory leaf) with exactly two entries, for blk1 and
      * blk2 in that order.
      *
      * Returns 0, or the error from xfs_da_grow_inode(), xfs_da_get_buf() or
      * xfs_da3_node_create().
 494  */
 495 STATIC int                                              /* error */
 496 xfs_da3_root_split(
 497         struct xfs_da_state     *state,
 498         struct xfs_da_state_blk *blk1,
 499         struct xfs_da_state_blk *blk2)
 500 {
 501         struct xfs_da_intnode   *node;
 502         struct xfs_da_intnode   *oldroot;
 503         struct xfs_da_node_entry *btree;
 504         struct xfs_da3_icnode_hdr nodehdr;
 505         struct xfs_da_args      *args;
 506         struct xfs_buf          *bp;
 507         struct xfs_inode        *dp;
 508         struct xfs_trans        *tp;
 509         struct xfs_dir2_leaf    *leaf;
 510         xfs_dablk_t             blkno;
 511         int                     level;
 512         int                     error;
 513         int                     size;
 514
 515         trace_xfs_da_root_split(state->args);
 516
 517         /*
 518          * Copy the existing (incorrect) block from the root node position
 519          * to a free space somewhere.
 520          */
 521         args = state->args;
 522         error = xfs_da_grow_inode(args, &blkno);
 523         if (error)
 524                 return error;
 525
 526         dp = args->dp;
 527         tp = args->trans;
 528         error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork);
 529         if (error)
 530                 return error;
 531         node = bp->b_addr;
 532         oldroot = blk1->bp->b_addr;
             /*
              * Work out how many bytes of the old root are in use (header plus
              * live entries) and which level it sits at.
              */
 533         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
 534             oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
 535                 struct xfs_da3_icnode_hdr icnodehdr;
 536
 537                 dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
 538                 btree = dp->d_ops->node_tree_p(oldroot);
 539                 size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
 540                 level = icnodehdr.level;
 541
 542                 /*
 543                  * we are about to copy oldroot to bp, so set up the type
 544                  * of bp while we know exactly what it will be.
 545                  */
 546                 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
 547         } else {
 548                 struct xfs_dir3_icleaf_hdr leafhdr;
 549                 struct xfs_dir2_leaf_entry *ents;
 550
 551                 leaf = (xfs_dir2_leaf_t *)oldroot;
 552                 dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
 553                 ents = dp->d_ops->leaf_ents_p(leaf);
 554
 555                 ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
 556                        leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
 557                 size = (int)((char *)&ents[leafhdr.count] - (char *)leaf);
 558                 level = 0;
 559
 560                 /*
 561                  * we are about to copy oldroot to bp, so set up the type
 562                  * of bp while we know exactly what it will be.
 563                  */
 564                 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
 565         }
 566
 567         /*
 568          * we can copy most of the information in the node from one block to
 569          * another, but for CRC enabled headers we have to make sure that the
 570          * block specific identifiers are kept intact. We update the buffer
 571          * directly for this.
 572          */
 573         memcpy(node, oldroot, size);
 574         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
 575             oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
 576                 struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
 577
 578                 node3->hdr.info.blkno = cpu_to_be64(bp->b_bn);
 579         }
 580         xfs_trans_log_buf(tp, bp, 0, size - 1);
 581
             /*
              * The copy inherits the old root's buffer ops and buffer type, and
              * blk1 is repointed at the relocated block from here on.
              */
 582         bp->b_ops = blk1->bp->b_ops;
 583         xfs_trans_buf_copy_type(bp, blk1->bp);
 584         blk1->bp = bp;
 585         blk1->blkno = blkno;
 586
 587         /*
 588          * Set up the new root node.
              *
              * It is created at geo->leafblk for the data fork and at block 0
              * for any other fork.
 589          */
 590         error = xfs_da3_node_create(args,
 591                 (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
 592                 level + 1, &bp, args->whichfork);
 593         if (error)
 594                 return error;
 595
 596         node = bp->b_addr;
 597         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 598         btree = dp->d_ops->node_tree_p(node);
 599         btree[0].hashval = cpu_to_be32(blk1->hashval);
 600         btree[0].before = cpu_to_be32(blk1->blkno);
 601         btree[1].hashval = cpu_to_be32(blk2->hashval);
 602         btree[1].before = cpu_to_be32(blk2->blkno);
 603         nodehdr.count = 2;
 604         dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 605
 606 #ifdef DEBUG
             /* for a directory leaf root, both children must lie in the leaf address range */
 607         if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
 608             oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
 609                 ASSERT(blk1->blkno >= args->geo->leafblk &&
 610                        blk1->blkno < args->geo->freeblk);
 611                 ASSERT(blk2->blkno >= args->geo->leafblk &&
 612                        blk2->blkno < args->geo->freeblk);
 613         }
 614 #endif
 615
 616         /* Header is already logged by xfs_da3_node_create */
 617         xfs_trans_log_buf(tp, bp,
 618                 XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2));
 619
 620         return 0;
 621 }
 622
 623 /*
 624  * Split the node, rebalance, then add the new entry.
      *
      * state     - split state (args, extrablk and its flags)
      * oldblk    - the existing intermediate node
      * newblk    - filled in with the new sibling node if a split is needed
      * addblk    - the block from the level below that must be inserted
      * treelevel - level given to a newly created sibling node
      * result    - set to 1 when the node was split, 0 when the entries fitted
      *
      * For the attr fork a pending extra block (state->extravalid) is inserted
      * as well, and extravalid is cleared once it has been added.
      *
      * Returns 0, or the error from xfs_da_grow_inode(), xfs_da3_node_create()
      * or xfs_da3_blk_link().
 625  */
 626 STATIC int                                              /* error */
 627 xfs_da3_node_split(
 628         struct xfs_da_state     *state,
 629         struct xfs_da_state_blk *oldblk,
 630         struct xfs_da_state_blk *newblk,
 631         struct xfs_da_state_blk *addblk,
 632         int                     treelevel,
 633         int                     *result)
 634 {
 635         struct xfs_da_intnode   *node;
 636         struct xfs_da3_icnode_hdr nodehdr;
 637         xfs_dablk_t             blkno;
 638         int                     newcount;
 639         int                     error;
 640         int                     useextra;
 641         struct xfs_inode        *dp = state->args->dp;
 642
 643         trace_xfs_da_node_split(state->args);
 644
 645         node = oldblk->bp->b_addr;
 646         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 647
 648         /*
 649          * With V2 dirs the extra block is data or freespace.
 650          */
 651         useextra = state->extravalid && state->args->whichfork == XFS_ATTR_FORK;
             /* one entry for addblk, plus one more when the extra block is used */
 652         newcount = 1 + useextra;
 653         /*
 654          * Do we have to split the node?
 655          */
 656         if (nodehdr.count + newcount > state->args->geo->node_ents) {
 657                 /*
 658                  * Allocate a new node, add to the doubly linked chain of
 659                  * nodes, then move some of our excess entries into it.
 660                  */
 661                 error = xfs_da_grow_inode(state->args, &blkno);
 662                 if (error)
 663                         return error;   /* GROT: dir is inconsistent */
 664
 665                 error = xfs_da3_node_create(state->args, blkno, treelevel,
 666                                            &newblk->bp, state->args->whichfork);
 667                 if (error)
 668                         return error;   /* GROT: dir is inconsistent */
 669                 newblk->blkno = blkno;
 670                 newblk->magic = XFS_DA_NODE_MAGIC;
 671                 xfs_da3_node_rebalance(state, oldblk, newblk);
 672                 error = xfs_da3_blk_link(state, oldblk, newblk);
 673                 if (error)
 674                         return error;
 675                 *result = 1;
 676         } else {
 677                 *result = 0;
 678         }
 679
 680         /*
 681          * Insert the new entry(s) into the correct block
 682          * (updating last hashval in the process).
 683          *
 684          * xfs_da3_node_add() inserts BEFORE the given index,
 685          * and as a result of using node_lookup_int() we always
 686          * point to a valid entry (not after one), but a split
 687          * operation always results in a new block whose hashvals
 688          * FOLLOW the current block.
 689          *
 690          * If we had double-split op below us, then add the extra block too.
 691          */
             /*
              * Re-read the old node's header (presumably because a rebalance
              * above may have changed its count) and pick the target block from
              * where oldblk->index falls relative to that count.
              */
 692         node = oldblk->bp->b_addr;
 693         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 694         if (oldblk->index <= nodehdr.count) {
 695                 oldblk->index++;
 696                 xfs_da3_node_add(state, oldblk, addblk);
 697                 if (useextra) {
 698                         if (state->extraafter)
 699                                 oldblk->index++;
 700                         xfs_da3_node_add(state, oldblk, &state->extrablk);
 701                         state->extravalid = 0;
 702                 }
 703         } else {
 704                 newblk->index++;
 705                 xfs_da3_node_add(state, newblk, addblk);
 706                 if (useextra) {
 707                         if (state->extraafter)
 708                                 newblk->index++;
 709                         xfs_da3_node_add(state, newblk, &state->extrablk);
 710                         state->extravalid = 0;
 711                 }
 712         }
 713
 714         return 0;
 715 }
 716
 717 /*
 718  * Balance the btree elements between two intermediate nodes,
 719  * usually one full and one empty.
 720  *
 721  * NOTE: if blk2 is empty, then it will get the upper half of blk1.
 722  */
 723 STATIC void
 724 xfs_da3_node_rebalance(
 725         struct xfs_da_state     *state,
 726         struct xfs_da_state_blk *blk1,
 727         struct xfs_da_state_blk *blk2)
 728 {
 729         struct xfs_da_intnode   *node1;
 730         struct xfs_da_intnode   *node2;
 731         struct xfs_da_intnode   *tmpnode;
 732         struct xfs_da_node_entry *btree1;
 733         struct xfs_da_node_entry *btree2;
 734         struct xfs_da_node_entry *btree_s;
 735         struct xfs_da_node_entry *btree_d;
 736         struct xfs_da3_icnode_hdr nodehdr1;
 737         struct xfs_da3_icnode_hdr nodehdr2;
 738         struct xfs_trans        *tp;
 739         int                     count;
 740         int                     tmp;
 741         int                     swap = 0;
 742         struct xfs_inode        *dp = state->args->dp;
 743
 744         trace_xfs_da_node_rebalance(state->args);
 745
 746         node1 = blk1->bp->b_addr;
 747         node2 = blk2->bp->b_addr;
 748         dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
 749         dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 750         btree1 = dp->d_ops->node_tree_p(node1);
 751         btree2 = dp->d_ops->node_tree_p(node2);
 752
 753         /*
 754          * Figure out how many entries need to move, and in which direction.
 755          * Swap the nodes around if that makes it simpler.
 756          */
 757         if (nodehdr1.count > 0 && nodehdr2.count > 0 &&
 758             ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
 759              (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) <
 760                         be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) {
 761                 tmpnode = node1;
 762                 node1 = node2;
 763                 node2 = tmpnode;
 764                 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
 765                 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 766                 btree1 = dp->d_ops->node_tree_p(node1);
 767                 btree2 = dp->d_ops->node_tree_p(node2);
 768                 swap = 1;
 769         }
 770
 771         count = (nodehdr1.count - nodehdr2.count) / 2;
 772         if (count == 0)
 773                 return;
 774         tp = state->args->trans;
 775         /*
 776          * Two cases: high-to-low and low-to-high.
 777          */
 778         if (count > 0) {
 779                 /*
 780                  * Move elements in node2 up to make a hole.
 781                  */
 782                 tmp = nodehdr2.count;
 783                 if (tmp > 0) {
 784                         tmp *= (uint)sizeof(xfs_da_node_entry_t);
 785                         btree_s = &btree2[0];
 786                         btree_d = &btree2[count];
 787                         memmove(btree_d, btree_s, tmp);
 788                 }
 789
 790                 /*
 791                  * Move the req'd B-tree elements from high in node1 to
 792                  * low in node2.
 793                  */
 794                 nodehdr2.count += count;
 795                 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
 796                 btree_s = &btree1[nodehdr1.count - count];
 797                 btree_d = &btree2[0];
 798                 memcpy(btree_d, btree_s, tmp);
 799                 nodehdr1.count -= count;
 800         } else {
 801                 /*
 802                  * Move the req'd B-tree elements from low in node2 to
 803                  * high in node1.
 804                  */
 805                 count = -count;
 806                 tmp = count * (uint)sizeof(xfs_da_node_entry_t);
 807                 btree_s = &btree2[0];
 808                 btree_d = &btree1[nodehdr1.count];
 809                 memcpy(btree_d, btree_s, tmp);
 810                 nodehdr1.count += count;
 811
 812                 xfs_trans_log_buf(tp, blk1->bp,
 813                         XFS_DA_LOGRANGE(node1, btree_d, tmp));
 814
 815                 /*
 816                  * Move elements in node2 down to fill the hole.
 817                  */
 818                 tmp  = nodehdr2.count - count;
 819                 tmp *= (uint)sizeof(xfs_da_node_entry_t);
 820                 btree_s = &btree2[count];
 821                 btree_d = &btree2[0];
 822                 memmove(btree_d, btree_s, tmp);
 823                 nodehdr2.count -= count;
 824         }
 825
 826         /*
 827          * Log header of node 1 and all current bits of node 2.
 828          */
 829         dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
 830         xfs_trans_log_buf(tp, blk1->bp,
 831                 XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
 832
 833         dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
 834         xfs_trans_log_buf(tp, blk2->bp,
 835                 XFS_DA_LOGRANGE(node2, &node2->hdr,
 836                                 dp->d_ops->node_hdr_size +
 837                                 (sizeof(btree2[0]) * nodehdr2.count)));
 838
 839         /*
 840          * Record the last hashval from each block for upward propagation.
 841          * (note: don't use the swapped node pointers)
 842          */
 843         if (swap) {
 844                 node1 = blk1->bp->b_addr;
 845                 node2 = blk2->bp->b_addr;
 846                 dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
 847                 dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
 848                 btree1 = dp->d_ops->node_tree_p(node1);
 849                 btree2 = dp->d_ops->node_tree_p(node2);
 850         }
 851         blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
 852         blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
 853
 854         /*
 855          * Adjust the expected index for insertion.
 856          */
 857         if (blk1->index >= nodehdr1.count) {
 858                 blk2->index = blk1->index - nodehdr1.count;
 859                 blk1->index = nodehdr1.count + 1;       /* make it invalid */
 860         }
 861 }
 862
 863 /*
 864  * Add a new entry to an intermediate node.
 865  */
 866 STATIC void
 867 xfs_da3_node_add(
 868         struct xfs_da_state     *state,
 869         struct xfs_da_state_blk *oldblk,
 870         struct xfs_da_state_blk *newblk)
 871 {
 872         struct xfs_da_intnode   *node;
 873         struct xfs_da3_icnode_hdr nodehdr;
 874         struct xfs_da_node_entry *btree;
 875         int                     tmp;
 876         struct xfs_inode        *dp = state->args->dp;
 877
 878         trace_xfs_da_node_add(state->args);
 879
 880         node = oldblk->bp->b_addr;
 881         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
 882         btree = dp->d_ops->node_tree_p(node);
 883
 884         ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
 885         ASSERT(newblk->blkno != 0);
 886         if (state->args->whichfork == XFS_DATA_FORK)
 887                 ASSERT(newblk->blkno >= state->args->geo->leafblk &&
 888                        newblk->blkno < state->args->geo->freeblk);
 889
 890         /*
 891          * We may need to make some room before we insert the new node.
 892          */
 893         tmp = 0;
 894         if (oldblk->index < nodehdr.count) {
 895                 tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree);
 896                 memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp);
 897         }
 898         btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval);
 899         btree[oldblk->index].before = cpu_to_be32(newblk->blkno);
 900         xfs_trans_log_buf(state->args->trans, oldblk->bp,
 901                 XFS_DA_LOGRANGE(node, &btree[oldblk->index],
 902                                 tmp + sizeof(*btree)));
 903
 904         nodehdr.count += 1;
 905         dp->d_ops->node_hdr_to_disk(node, &nodehdr);
 906         xfs_trans_log_buf(state->args->trans, oldblk->bp,
 907                 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
 908
 909         /*
 910          * Copy the last hash value from the oldblk to propagate upwards.
 911          */
 912         oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
 913 }
 914
 915 /*========================================================================
 916  * Routines used for shrinking the Btree.
 917  *========================================================================*/
 918
 919 /*
 920  * Deallocate an empty leaf node, remove it from its parent,
 921  * possibly deallocating that block, etc...
 922  */
 923 int
 924 xfs_da3_join(
 925         struct xfs_da_state     *state)
 926 {
 927         struct xfs_da_state_blk *drop_blk;
 928         struct xfs_da_state_blk *save_blk;
 929         int                     action = 0;
 930         int                     error;
 931
 932         trace_xfs_da_join(state->args);
 933
 934         drop_blk = &state->path.blk[ state->path.active-1 ];
 935         save_blk = &state->altpath.blk[ state->path.active-1 ];
 936         ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC);
 937         ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC ||
 938                drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
 939
 940         /*
 941          * Walk back up the tree joining/deallocating as necessary.
 942          * When we stop dropping blocks, break out.
 943          */
 944         for (  ; state->path.active >= 2; drop_blk--, save_blk--,
 945                  state->path.active--) {
 946                 /*
 947                  * See if we can combine the block with a neighbor.
 948                  *   (action == 0) => no options, just leave
 949                  *   (action == 1) => coalesce, then unlink
 950                  *   (action == 2) => block empty, unlink it
 951                  */
 952                 switch (drop_blk->magic) {
 953                 case XFS_ATTR_LEAF_MAGIC:
 954                         error = xfs_attr3_leaf_toosmall(state, &action);
 955                         if (error)
 956                                 return error;
 957                         if (action == 0)
 958                                 return 0;
 959                         xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
 960                         break;
 961                 case XFS_DIR2_LEAFN_MAGIC:
 962                         error = xfs_dir2_leafn_toosmall(state, &action);
 963                         if (error)
 964                                 return error;
 965                         if (action == 0)
 966                                 return 0;
 967                         xfs_dir2_leafn_unbalance(state, drop_blk, save_blk);
 968                         break;
 969                 case XFS_DA_NODE_MAGIC:
 970                         /*
 971                          * Remove the offending node, fixup hashvals,
 972                          * check for a toosmall neighbor.
 973                          */
 974                         xfs_da3_node_remove(state, drop_blk);
 975                         xfs_da3_fixhashpath(state, &state->path);
 976                         error = xfs_da3_node_toosmall(state, &action);
 977                         if (error)
 978                                 return error;
 979                         if (action == 0)
 980                                 return 0;
 981                         xfs_da3_node_unbalance(state, drop_blk, save_blk);
 982                         break;
 983                 }
 984                 xfs_da3_fixhashpath(state, &state->altpath);
 985                 error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
 986                 xfs_da_state_kill_altpath(state);
 987                 if (error)
 988                         return error;
 989                 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
 990                                                          drop_blk->bp);
 991                 drop_blk->bp = NULL;
 992                 if (error)
 993                         return error;
 994         }
 995         /*
 996          * We joined all the way to the top.  If it turns out that
 997          * we only have one entry in the root, make the child block
 998          * the new root.
 999          */
1000         xfs_da3_node_remove(state, drop_blk);
1001         xfs_da3_fixhashpath(state, &state->path);
1002         error = xfs_da3_root_join(state, &state->path.blk[0]);
1003         return error;
1004 }
1005
#ifdef	DEBUG
/*
 * Debug-only sanity check for a block that is the sole child of a node at
 * the given level: below a level-1 node it must carry a leaf magic,
 * otherwise a node magic, and in both cases it must have no siblings.
 * Compiles away to nothing in non-DEBUG builds.
 */
static void
xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level)
{
	__be16	magic = blkinfo->magic;

	if (level != 1) {
		ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
		       magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
	} else {
		ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
		       magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
		       magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
		       magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
	}

	/* An only child has neither a forward nor a backward sibling. */
	ASSERT(!blkinfo->forw);
	ASSERT(!blkinfo->back);
}
#else	/* !DEBUG */
#define xfs_da_blkinfo_onlychild_validate(blkinfo, level)
#endif	/* !DEBUG */
1027
1028 /*
1029  * We have only one entry in the root.  Copy the only remaining child of
1030  * the old root to block 0 as the new root node.
1031  */
1032 STATIC int
1033 xfs_da3_root_join(
1034         struct xfs_da_state     *state,
1035         struct xfs_da_state_blk *root_blk)
1036 {
1037         struct xfs_da_intnode   *oldroot;
1038         struct xfs_da_args      *args;
1039         xfs_dablk_t             child;
1040         struct xfs_buf          *bp;
1041         struct xfs_da3_icnode_hdr oldroothdr;
1042         struct xfs_da_node_entry *btree;
1043         int                     error;
1044         struct xfs_inode        *dp = state->args->dp;
1045
1046         trace_xfs_da_root_join(state->args);
1047
1048         ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC);
1049
1050         args = state->args;
1051         oldroot = root_blk->bp->b_addr;
1052         dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
1053         ASSERT(oldroothdr.forw == 0);
1054         ASSERT(oldroothdr.back == 0);
1055
1056         /*
1057          * If the root has more than one child, then don't do anything.
1058          */
1059         if (oldroothdr.count > 1)
1060                 return 0;
1061
1062         /*
1063          * Read in the (only) child block, then copy those bytes into
1064          * the root block's buffer and free the original child block.
1065          */
1066         btree = dp->d_ops->node_tree_p(oldroot);
1067         child = be32_to_cpu(btree[0].before);
1068         ASSERT(child != 0);
1069         error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
1070                                              args->whichfork);
1071         if (error)
1072                 return error;
1073         xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
1074
1075         /*
1076          * This could be copying a leaf back into the root block in the case of
1077          * there only being a single leaf block left in the tree. Hence we have
1078          * to update the b_ops pointer as well to match the buffer type change
1079          * that could occur. For dir3 blocks we also need to update the block
1080          * number in the buffer header.
1081          */
1082         memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
1083         root_blk->bp->b_ops = bp->b_ops;
1084         xfs_trans_buf_copy_type(root_blk->bp, bp);
1085         if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
1086                 struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
1087                 da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
1088         }
1089         xfs_trans_log_buf(args->trans, root_blk->bp, 0,
1090                           args->geo->blksize - 1);
1091         error = xfs_da_shrink_inode(args, child, bp);
1092         return error;
1093 }
1094
1095 /*
1096  * Check a node block and its neighbors to see if the block should be
1097  * collapsed into one or the other neighbor.  Always keep the block
1098  * with the smaller block number.
1099  * If the current block is over 50% full, don't try to join it, return 0.
1100  * If the block is empty, fill in the state structure and return 2.
1101  * If it can be collapsed, fill in the state structure and return 1.
1102  * If nothing can be done, return 0.
1103  */
1104 STATIC int
1105 xfs_da3_node_toosmall(
1106         struct xfs_da_state     *state,
1107         int                     *action)
1108 {
1109         struct xfs_da_intnode   *node;
1110         struct xfs_da_state_blk *blk;
1111         struct xfs_da_blkinfo   *info;
1112         xfs_dablk_t             blkno;
1113         struct xfs_buf          *bp;
1114         struct xfs_da3_icnode_hdr nodehdr;
1115         int                     count;
1116         int                     forward;
1117         int                     error;
1118         int                     retval;
1119         int                     i;
1120         struct xfs_inode        *dp = state->args->dp;
1121
1122         trace_xfs_da_node_toosmall(state->args);
1123
1124         /*
1125          * Check for the degenerate case of the block being over 50% full.
1126          * If so, it's not worth even looking to see if we might be able
1127          * to coalesce with a sibling.
1128          */
1129         blk = &state->path.blk[ state->path.active-1 ];
1130         info = blk->bp->b_addr;
1131         node = (xfs_da_intnode_t *)info;
1132         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1133         if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
1134                 *action = 0;    /* blk over 50%, don't try to join */
1135                 return 0;       /* blk over 50%, don't try to join */
1136         }
1137
1138         /*
1139          * Check for the degenerate case of the block being empty.
1140          * If the block is empty, we'll simply delete it, no need to
1141          * coalesce it with a sibling block.  We choose (arbitrarily)
1142          * to merge with the forward block unless it is NULL.
1143          */
1144         if (nodehdr.count == 0) {
1145                 /*
1146                  * Make altpath point to the block we want to keep and
1147                  * path point to the block we want to drop (this one).
1148                  */
1149                 forward = (info->forw != 0);
1150                 memcpy(&state->altpath, &state->path, sizeof(state->path));
1151                 error = xfs_da3_path_shift(state, &state->altpath, forward,
1152                                                  0, &retval);
1153                 if (error)
1154                         return error;
1155                 if (retval) {
1156                         *action = 0;
1157                 } else {
1158                         *action = 2;
1159                 }
1160                 return 0;
1161         }
1162
1163         /*
1164          * Examine each sibling block to see if we can coalesce with
1165          * at least 25% free space to spare.  We need to figure out
1166          * whether to merge with the forward or the backward block.
1167          * We prefer coalescing with the lower numbered sibling so as
1168          * to shrink a directory over time.
1169          */
1170         count  = state->args->geo->node_ents;
1171         count -= state->args->geo->node_ents >> 2;
1172         count -= nodehdr.count;
1173
1174         /* start with smaller blk num */
1175         forward = nodehdr.forw < nodehdr.back;
1176         for (i = 0; i < 2; forward = !forward, i++) {
1177                 struct xfs_da3_icnode_hdr thdr;
1178                 if (forward)
1179                         blkno = nodehdr.forw;
1180                 else
1181                         blkno = nodehdr.back;
1182                 if (blkno == 0)
1183                         continue;
1184                 error = xfs_da3_node_read(state->args->trans, dp,
1185                                         blkno, -1, &bp, state->args->whichfork);
1186                 if (error)
1187                         return error;
1188
1189                 node = bp->b_addr;
1190                 dp->d_ops->node_hdr_from_disk(&thdr, node);
1191                 xfs_trans_brelse(state->args->trans, bp);
1192
1193                 if (count - thdr.count >= 0)
1194                         break;  /* fits with at least 25% to spare */
1195         }
1196         if (i >= 2) {
1197                 *action = 0;
1198                 return 0;
1199         }
1200
1201         /*
1202          * Make altpath point to the block we want to keep (the lower
1203          * numbered block) and path point to the block we want to drop.
1204          */
1205         memcpy(&state->altpath, &state->path, sizeof(state->path));
1206         if (blkno < blk->blkno) {
1207                 error = xfs_da3_path_shift(state, &state->altpath, forward,
1208                                                  0, &retval);
1209         } else {
1210                 error = xfs_da3_path_shift(state, &state->path, forward,
1211                                                  0, &retval);
1212         }
1213         if (error)
1214                 return error;
1215         if (retval) {
1216                 *action = 0;
1217                 return 0;
1218         }
1219         *action = 1;
1220         return 0;
1221 }
1222
1223 /*
1224  * Pick up the last hashvalue from an intermediate node.
1225  */
1226 STATIC uint
1227 xfs_da3_node_lasthash(
1228         struct xfs_inode        *dp,
1229         struct xfs_buf          *bp,
1230         int                     *count)
1231 {
1232         struct xfs_da_intnode    *node;
1233         struct xfs_da_node_entry *btree;
1234         struct xfs_da3_icnode_hdr nodehdr;
1235
1236         node = bp->b_addr;
1237         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1238         if (count)
1239                 *count = nodehdr.count;
1240         if (!nodehdr.count)
1241                 return 0;
1242         btree = dp->d_ops->node_tree_p(node);
1243         return be32_to_cpu(btree[nodehdr.count - 1].hashval);
1244 }
1245
1246 /*
1247  * Walk back up the tree adjusting hash values as necessary,
1248  * when we stop making changes, return.
1249  */
1250 void
1251 xfs_da3_fixhashpath(
1252         struct xfs_da_state     *state,
1253         struct xfs_da_state_path *path)
1254 {
1255         struct xfs_da_state_blk *blk;
1256         struct xfs_da_intnode   *node;
1257         struct xfs_da_node_entry *btree;
1258         xfs_dahash_t            lasthash=0;
1259         int                     level;
1260         int                     count;
1261         struct xfs_inode        *dp = state->args->dp;
1262
1263         trace_xfs_da_fixhashpath(state->args);
1264
1265         level = path->active-1;
1266         blk = &path->blk[ level ];
1267         switch (blk->magic) {
1268         case XFS_ATTR_LEAF_MAGIC:
1269                 lasthash = xfs_attr_leaf_lasthash(blk->bp, &count);
1270                 if (count == 0)
1271                         return;
1272                 break;
1273         case XFS_DIR2_LEAFN_MAGIC:
1274                 lasthash = xfs_dir2_leaf_lasthash(dp, blk->bp, &count);
1275                 if (count == 0)
1276                         return;
1277                 break;
1278         case XFS_DA_NODE_MAGIC:
1279                 lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
1280                 if (count == 0)
1281                         return;
1282                 break;
1283         }
1284         for (blk--, level--; level >= 0; blk--, level--) {
1285                 struct xfs_da3_icnode_hdr nodehdr;
1286
1287                 node = blk->bp->b_addr;
1288                 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1289                 btree = dp->d_ops->node_tree_p(node);
1290                 if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
1291                         break;
1292                 blk->hashval = lasthash;
1293                 btree[blk->index].hashval = cpu_to_be32(lasthash);
1294                 xfs_trans_log_buf(state->args->trans, blk->bp,
1295                                   XFS_DA_LOGRANGE(node, &btree[blk->index],
1296                                                   sizeof(*btree)));
1297
1298                 lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1299         }
1300 }
1301
1302 /*
1303  * Remove an entry from an intermediate node.
1304  */
1305 STATIC void
1306 xfs_da3_node_remove(
1307         struct xfs_da_state     *state,
1308         struct xfs_da_state_blk *drop_blk)
1309 {
1310         struct xfs_da_intnode   *node;
1311         struct xfs_da3_icnode_hdr nodehdr;
1312         struct xfs_da_node_entry *btree;
1313         int                     index;
1314         int                     tmp;
1315         struct xfs_inode        *dp = state->args->dp;
1316
1317         trace_xfs_da_node_remove(state->args);
1318
1319         node = drop_blk->bp->b_addr;
1320         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1321         ASSERT(drop_blk->index < nodehdr.count);
1322         ASSERT(drop_blk->index >= 0);
1323
1324         /*
1325          * Copy over the offending entry, or just zero it out.
1326          */
1327         index = drop_blk->index;
1328         btree = dp->d_ops->node_tree_p(node);
1329         if (index < nodehdr.count - 1) {
1330                 tmp  = nodehdr.count - index - 1;
1331                 tmp *= (uint)sizeof(xfs_da_node_entry_t);
1332                 memmove(&btree[index], &btree[index + 1], tmp);
1333                 xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1334                     XFS_DA_LOGRANGE(node, &btree[index], tmp));
1335                 index = nodehdr.count - 1;
1336         }
1337         memset(&btree[index], 0, sizeof(xfs_da_node_entry_t));
1338         xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1339             XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
1340         nodehdr.count -= 1;
1341         dp->d_ops->node_hdr_to_disk(node, &nodehdr);
1342         xfs_trans_log_buf(state->args->trans, drop_blk->bp,
1343             XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
1344
1345         /*
1346          * Copy the last hash value from the block to propagate upwards.
1347          */
1348         drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval);
1349 }
1350
1351 /*
1352  * Unbalance the elements between two intermediate nodes,
1353  * move all Btree elements from one node into another.
1354  */
1355 STATIC void
1356 xfs_da3_node_unbalance(
1357         struct xfs_da_state     *state,
1358         struct xfs_da_state_blk *drop_blk,
1359         struct xfs_da_state_blk *save_blk)
1360 {
1361         struct xfs_da_intnode   *drop_node;
1362         struct xfs_da_intnode   *save_node;
1363         struct xfs_da_node_entry *drop_btree;
1364         struct xfs_da_node_entry *save_btree;
1365         struct xfs_da3_icnode_hdr drop_hdr;
1366         struct xfs_da3_icnode_hdr save_hdr;
1367         struct xfs_trans        *tp;
1368         int                     sindex;
1369         int                     tmp;
1370         struct xfs_inode        *dp = state->args->dp;
1371
1372         trace_xfs_da_node_unbalance(state->args);
1373
1374         drop_node = drop_blk->bp->b_addr;
1375         save_node = save_blk->bp->b_addr;
1376         dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
1377         dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
1378         drop_btree = dp->d_ops->node_tree_p(drop_node);
1379         save_btree = dp->d_ops->node_tree_p(save_node);
1380         tp = state->args->trans;
1381
1382         /*
1383          * If the dying block has lower hashvals, then move all the
1384          * elements in the remaining block up to make a hole.
1385          */
1386         if ((be32_to_cpu(drop_btree[0].hashval) <
1387                         be32_to_cpu(save_btree[0].hashval)) ||
1388             (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) <
1389                         be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) {
1390                 /* XXX: check this - is memmove dst correct? */
1391                 tmp = save_hdr.count * sizeof(xfs_da_node_entry_t);
1392                 memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp);
1393
1394                 sindex = 0;
1395                 xfs_trans_log_buf(tp, save_blk->bp,
1396                         XFS_DA_LOGRANGE(save_node, &save_btree[0],
1397                                 (save_hdr.count + drop_hdr.count) *
1398                                                 sizeof(xfs_da_node_entry_t)));
1399         } else {
1400                 sindex = save_hdr.count;
1401                 xfs_trans_log_buf(tp, save_blk->bp,
1402                         XFS_DA_LOGRANGE(save_node, &save_btree[sindex],
1403                                 drop_hdr.count * sizeof(xfs_da_node_entry_t)));
1404         }
1405
1406         /*
1407          * Move all the B-tree elements from drop_blk to save_blk.
1408          */
1409         tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t);
1410         memcpy(&save_btree[sindex], &drop_btree[0], tmp);
1411         save_hdr.count += drop_hdr.count;
1412
1413         dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
1414         xfs_trans_log_buf(tp, save_blk->bp,
1415                 XFS_DA_LOGRANGE(save_node, &save_node->hdr,
1416                                 dp->d_ops->node_hdr_size));
1417
1418         /*
1419          * Save the last hashval in the remaining block for upward propagation.
1420          */
1421         save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval);
1422 }
1423
1424 /*========================================================================
1425  * Routines used for finding things in the Btree.
1426  *========================================================================*/
1427
1428 /*
1429  * Walk down the Btree looking for a particular filename, filling
1430  * in the state structure as we go.
1431  *
1432  * We will set the state structure to point to each of the elements
1433  * in each of the nodes where either the hashval is or should be.
1434  *
1435  * We support duplicate hashval's so for each entry in the current
1436  * node that could contain the desired hashval, descend.  This is a
1437  * pruned depth-first tree search.
1438  */
1439 int                                                     /* error */
1440 xfs_da3_node_lookup_int(
1441         struct xfs_da_state     *state,
1442         int                     *result)
1443 {
1444         struct xfs_da_state_blk *blk;
1445         struct xfs_da_blkinfo   *curr;
1446         struct xfs_da_intnode   *node;
1447         struct xfs_da_node_entry *btree;
1448         struct xfs_da3_icnode_hdr nodehdr;
1449         struct xfs_da_args      *args;
1450         xfs_dablk_t             blkno;
1451         xfs_dahash_t            hashval;
1452         xfs_dahash_t            btreehashval;
1453         int                     probe;
1454         int                     span;
1455         int                     max;
1456         int                     error;
1457         int                     retval;
1458         unsigned int            expected_level = 0;
1459         struct xfs_inode        *dp = state->args->dp;
1460
1461         args = state->args;
1462
1463         /*
1464          * Descend thru the B-tree searching each level for the right
1465          * node to use, until the right hashval is found.
1466          */
1467         blkno = args->geo->leafblk;
1468         for (blk = &state->path.blk[0], state->path.active = 1;
1469                          state->path.active <= XFS_DA_NODE_MAXDEPTH;
1470                          blk++, state->path.active++) {
1471                 /*
1472                  * Read the next node down in the tree.
1473                  */
1474                 blk->blkno = blkno;
1475                 error = xfs_da3_node_read(args->trans, args->dp, blkno,
1476                                         -1, &blk->bp, args->whichfork);
1477                 if (error) {
1478                         blk->blkno = 0;
1479                         state->path.active--;
1480                         return error;
1481                 }
1482                 curr = blk->bp->b_addr;
1483                 blk->magic = be16_to_cpu(curr->magic);
1484
1485                 if (blk->magic == XFS_ATTR_LEAF_MAGIC ||
1486                     blk->magic == XFS_ATTR3_LEAF_MAGIC) {
1487                         blk->magic = XFS_ATTR_LEAF_MAGIC;
1488                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1489                         break;
1490                 }
1491
1492                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1493                     blk->magic == XFS_DIR3_LEAFN_MAGIC) {
1494                         blk->magic = XFS_DIR2_LEAFN_MAGIC;
1495                         blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
1496                                                               blk->bp, NULL);
1497                         break;
1498                 }
1499
1500                 blk->magic = XFS_DA_NODE_MAGIC;
1501
1502
1503                 /*
1504                  * Search an intermediate node for a match.
1505                  */
1506                 node = blk->bp->b_addr;
1507                 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1508                 btree = dp->d_ops->node_tree_p(node);
1509
1510                 /* Tree taller than we can handle; bail out! */
1511                 if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
1512                         return -EFSCORRUPTED;
1513
1514                 /* Check the level from the root. */
1515                 if (blkno == args->geo->leafblk)
1516                         expected_level = nodehdr.level - 1;
1517                 else if (expected_level != nodehdr.level)
1518                         return -EFSCORRUPTED;
1519                 else
1520                         expected_level--;
1521
1522                 max = nodehdr.count;
1523                 blk->hashval = be32_to_cpu(btree[max - 1].hashval);
1524
1525                 /*
1526                  * Binary search.  (note: small blocks will skip loop)
1527                  */
1528                 probe = span = max / 2;
1529                 hashval = args->hashval;
1530                 while (span > 4) {
1531                         span /= 2;
1532                         btreehashval = be32_to_cpu(btree[probe].hashval);
1533                         if (btreehashval < hashval)
1534                                 probe += span;
1535                         else if (btreehashval > hashval)
1536                                 probe -= span;
1537                         else
1538                                 break;
1539                 }
1540                 ASSERT((probe >= 0) && (probe < max));
1541                 ASSERT((span <= 4) ||
1542                         (be32_to_cpu(btree[probe].hashval) == hashval));
1543
1544                 /*
1545                  * Since we may have duplicate hashval's, find the first
1546                  * matching hashval in the node.
1547                  */
1548                 while (probe > 0 &&
1549                        be32_to_cpu(btree[probe].hashval) >= hashval) {
1550                         probe--;
1551                 }
1552                 while (probe < max &&
1553                        be32_to_cpu(btree[probe].hashval) < hashval) {
1554                         probe++;
1555                 }
1556
1557                 /*
1558                  * Pick the right block to descend on.
1559                  */
1560                 if (probe == max) {
1561                         blk->index = max - 1;
1562                         blkno = be32_to_cpu(btree[max - 1].before);
1563                 } else {
1564                         blk->index = probe;
1565                         blkno = be32_to_cpu(btree[probe].before);
1566                 }
1567
1568                 /* We can't point back to the root. */
1569                 if (blkno == args->geo->leafblk)
1570                         return -EFSCORRUPTED;
1571         }
1572
1573         if (expected_level != 0)
1574                 return -EFSCORRUPTED;
1575
1576         /*
1577          * A leaf block that ends in the hashval that we are interested in
1578          * (final hashval == search hashval) means that the next block may
1579          * contain more entries with the same hashval, shift upward to the
1580          * next leaf and keep searching.
1581          */
1582         for (;;) {
1583                 if (blk->magic == XFS_DIR2_LEAFN_MAGIC) {
1584                         retval = xfs_dir2_leafn_lookup_int(blk->bp, args,
1585                                                         &blk->index, state);
1586                 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1587                         retval = xfs_attr3_leaf_lookup_int(blk->bp, args);
1588                         blk->index = args->index;
1589                         args->blkno = blk->blkno;
1590                 } else {
1591                         ASSERT(0);
1592                         return -EFSCORRUPTED;
1593                 }
1594                 if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
1595                     (blk->hashval == args->hashval)) {
1596                         error = xfs_da3_path_shift(state, &state->path, 1, 1,
1597                                                          &retval);
1598                         if (error)
1599                                 return error;
1600                         if (retval == 0) {
1601                                 continue;
1602                         } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1603                                 /* path_shift() gives ENOENT */
1604                                 retval = -ENOATTR;
1605                         }
1606                 }
1607                 break;
1608         }
1609         *result = retval;
1610         return 0;
1611 }
1612
1613 /*========================================================================
1614  * Utility routines.
1615  *========================================================================*/
1616
1617 /*
1618  * Compare two intermediate nodes for "order".
1619  */
1620 STATIC int
1621 xfs_da3_node_order(
1622         struct xfs_inode *dp,
1623         struct xfs_buf  *node1_bp,
1624         struct xfs_buf  *node2_bp)
1625 {
1626         struct xfs_da_intnode   *node1;
1627         struct xfs_da_intnode   *node2;
1628         struct xfs_da_node_entry *btree1;
1629         struct xfs_da_node_entry *btree2;
1630         struct xfs_da3_icnode_hdr node1hdr;
1631         struct xfs_da3_icnode_hdr node2hdr;
1632
1633         node1 = node1_bp->b_addr;
1634         node2 = node2_bp->b_addr;
1635         dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
1636         dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
1637         btree1 = dp->d_ops->node_tree_p(node1);
1638         btree2 = dp->d_ops->node_tree_p(node2);
1639
1640         if (node1hdr.count > 0 && node2hdr.count > 0 &&
1641             ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
1642              (be32_to_cpu(btree2[node2hdr.count - 1].hashval) <
1643               be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) {
1644                 return 1;
1645         }
1646         return 0;
1647 }
1648
1649 /*
1650  * Link a new block into a doubly linked list of blocks (of whatever type).
1651  */
1652 int                                                     /* error */
1653 xfs_da3_blk_link(
1654         struct xfs_da_state     *state,
1655         struct xfs_da_state_blk *old_blk,
1656         struct xfs_da_state_blk *new_blk)
1657 {
1658         struct xfs_da_blkinfo   *old_info;
1659         struct xfs_da_blkinfo   *new_info;
1660         struct xfs_da_blkinfo   *tmp_info;
1661         struct xfs_da_args      *args;
1662         struct xfs_buf          *bp;
1663         int                     before = 0;
1664         int                     error;
1665         struct xfs_inode        *dp = state->args->dp;
1666
1667         /*
1668          * Set up environment.
1669          */
1670         args = state->args;
1671         ASSERT(args != NULL);
1672         old_info = old_blk->bp->b_addr;
1673         new_info = new_blk->bp->b_addr;
1674         ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC ||
1675                old_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1676                old_blk->magic == XFS_ATTR_LEAF_MAGIC);
1677
1678         switch (old_blk->magic) {
1679         case XFS_ATTR_LEAF_MAGIC:
1680                 before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
1681                 break;
1682         case XFS_DIR2_LEAFN_MAGIC:
1683                 before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
1684                 break;
1685         case XFS_DA_NODE_MAGIC:
1686                 before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
1687                 break;
1688         }
1689
1690         /*
1691          * Link blocks in appropriate order.
1692          */
1693         if (before) {
1694                 /*
1695                  * Link new block in before existing block.
1696                  */
1697                 trace_xfs_da_link_before(args);
1698                 new_info->forw = cpu_to_be32(old_blk->blkno);
1699                 new_info->back = old_info->back;
1700                 if (old_info->back) {
1701                         error = xfs_da3_node_read(args->trans, dp,
1702                                                 be32_to_cpu(old_info->back),
1703                                                 -1, &bp, args->whichfork);
1704                         if (error)
1705                                 return error;
1706                         ASSERT(bp != NULL);
1707                         tmp_info = bp->b_addr;
1708                         ASSERT(tmp_info->magic == old_info->magic);
1709                         ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno);
1710                         tmp_info->forw = cpu_to_be32(new_blk->blkno);
1711                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1712                 }
1713                 old_info->back = cpu_to_be32(new_blk->blkno);
1714         } else {
1715                 /*
1716                  * Link new block in after existing block.
1717                  */
1718                 trace_xfs_da_link_after(args);
1719                 new_info->forw = old_info->forw;
1720                 new_info->back = cpu_to_be32(old_blk->blkno);
1721                 if (old_info->forw) {
1722                         error = xfs_da3_node_read(args->trans, dp,
1723                                                 be32_to_cpu(old_info->forw),
1724                                                 -1, &bp, args->whichfork);
1725                         if (error)
1726                                 return error;
1727                         ASSERT(bp != NULL);
1728                         tmp_info = bp->b_addr;
1729                         ASSERT(tmp_info->magic == old_info->magic);
1730                         ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno);
1731                         tmp_info->back = cpu_to_be32(new_blk->blkno);
1732                         xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1);
1733                 }
1734                 old_info->forw = cpu_to_be32(new_blk->blkno);
1735         }
1736
1737         xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1738         xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1739         return 0;
1740 }
1741
1742 /*
1743  * Unlink a block from a doubly linked list of blocks.
1744  */
1745 STATIC int                                              /* error */
1746 xfs_da3_blk_unlink(
1747         struct xfs_da_state     *state,
1748         struct xfs_da_state_blk *drop_blk,
1749         struct xfs_da_state_blk *save_blk)
1750 {
1751         struct xfs_da_blkinfo   *drop_info;
1752         struct xfs_da_blkinfo   *save_info;
1753         struct xfs_da_blkinfo   *tmp_info;
1754         struct xfs_da_args      *args;
1755         struct xfs_buf          *bp;
1756         int                     error;
1757
1758         /*
1759          * Set up environment.
1760          */
1761         args = state->args;
1762         ASSERT(args != NULL);
1763         save_info = save_blk->bp->b_addr;
1764         drop_info = drop_blk->bp->b_addr;
1765         ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC ||
1766                save_blk->magic == XFS_DIR2_LEAFN_MAGIC ||
1767                save_blk->magic == XFS_ATTR_LEAF_MAGIC);
1768         ASSERT(save_blk->magic == drop_blk->magic);
1769         ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) ||
1770                (be32_to_cpu(save_info->back) == drop_blk->blkno));
1771         ASSERT((be32_to_cpu(drop_info->forw) == save_blk->blkno) ||
1772                (be32_to_cpu(drop_info->back) == save_blk->blkno));
1773
1774         /*
1775          * Unlink the leaf block from the doubly linked chain of leaves.
1776          */
1777         if (be32_to_cpu(save_info->back) == drop_blk->blkno) {
1778                 trace_xfs_da_unlink_back(args);
1779                 save_info->back = drop_info->back;
1780                 if (drop_info->back) {
1781                         error = xfs_da3_node_read(args->trans, args->dp,
1782                                                 be32_to_cpu(drop_info->back),
1783                                                 -1, &bp, args->whichfork);
1784                         if (error)
1785                                 return error;
1786                         ASSERT(bp != NULL);
1787                         tmp_info = bp->b_addr;
1788                         ASSERT(tmp_info->magic == save_info->magic);
1789                         ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno);
1790                         tmp_info->forw = cpu_to_be32(save_blk->blkno);
1791                         xfs_trans_log_buf(args->trans, bp, 0,
1792                                                     sizeof(*tmp_info) - 1);
1793                 }
1794         } else {
1795                 trace_xfs_da_unlink_forward(args);
1796                 save_info->forw = drop_info->forw;
1797                 if (drop_info->forw) {
1798                         error = xfs_da3_node_read(args->trans, args->dp,
1799                                                 be32_to_cpu(drop_info->forw),
1800                                                 -1, &bp, args->whichfork);
1801                         if (error)
1802                                 return error;
1803                         ASSERT(bp != NULL);
1804                         tmp_info = bp->b_addr;
1805                         ASSERT(tmp_info->magic == save_info->magic);
1806                         ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno);
1807                         tmp_info->back = cpu_to_be32(save_blk->blkno);
1808                         xfs_trans_log_buf(args->trans, bp, 0,
1809                                                     sizeof(*tmp_info) - 1);
1810                 }
1811         }
1812
1813         xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1814         return 0;
1815 }
1816
1817 /*
1818  * Move a path "forward" or "!forward" one block at the current level.
1819  *
1820  * This routine will adjust a "path" to point to the next block
1821  * "forward" (higher hashvalues) or "!forward" (lower hashvals) in the
1822  * Btree, including updating pointers to the intermediate nodes between
1823  * the new bottom and the root.
1824  */
1825 int                                                     /* error */
1826 xfs_da3_path_shift(
1827         struct xfs_da_state     *state,
1828         struct xfs_da_state_path *path,
1829         int                     forward,
1830         int                     release,
1831         int                     *result)
1832 {
1833         struct xfs_da_state_blk *blk;
1834         struct xfs_da_blkinfo   *info;
1835         struct xfs_da_intnode   *node;
1836         struct xfs_da_args      *args;
1837         struct xfs_da_node_entry *btree;
1838         struct xfs_da3_icnode_hdr nodehdr;
1839         struct xfs_buf          *bp;
1840         xfs_dablk_t             blkno = 0;
1841         int                     level;
1842         int                     error;
1843         struct xfs_inode        *dp = state->args->dp;
1844
1845         trace_xfs_da_path_shift(state->args);
1846
1847         /*
1848          * Roll up the Btree looking for the first block where our
1849          * current index is not at the edge of the block.  Note that
1850          * we skip the bottom layer because we want the sibling block.
1851          */
1852         args = state->args;
1853         ASSERT(args != NULL);
1854         ASSERT(path != NULL);
1855         ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
1856         level = (path->active-1) - 1;   /* skip bottom layer in path */
1857         for (blk = &path->blk[level]; level >= 0; blk--, level--) {
1858                 node = blk->bp->b_addr;
1859                 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1860                 btree = dp->d_ops->node_tree_p(node);
1861
1862                 if (forward && (blk->index < nodehdr.count - 1)) {
1863                         blk->index++;
1864                         blkno = be32_to_cpu(btree[blk->index].before);
1865                         break;
1866                 } else if (!forward && (blk->index > 0)) {
1867                         blk->index--;
1868                         blkno = be32_to_cpu(btree[blk->index].before);
1869                         break;
1870                 }
1871         }
1872         if (level < 0) {
1873                 *result = -ENOENT;      /* we're out of our tree */
1874                 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1875                 return 0;
1876         }
1877
1878         /*
1879          * Roll down the edge of the subtree until we reach the
1880          * same depth we were at originally.
1881          */
1882         for (blk++, level++; level < path->active; blk++, level++) {
1883                 /*
1884                  * Read the next child block into a local buffer.
1885                  */
1886                 error = xfs_da3_node_read(args->trans, dp, blkno, -1, &bp,
1887                                           args->whichfork);
1888                 if (error)
1889                         return error;
1890
1891                 /*
1892                  * Release the old block (if it's dirty, the trans doesn't
1893                  * actually let go) and swap the local buffer into the path
1894                  * structure. This ensures failure of the above read doesn't set
1895                  * a NULL buffer in an active slot in the path.
1896                  */
1897                 if (release)
1898                         xfs_trans_brelse(args->trans, blk->bp);
1899                 blk->blkno = blkno;
1900                 blk->bp = bp;
1901
1902                 info = blk->bp->b_addr;
1903                 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1904                        info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
1905                        info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
1906                        info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) ||
1907                        info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) ||
1908                        info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC));
1909
1910
1911                 /*
1912                  * Note: we flatten the magic number to a single type so we
1913                  * don't have to compare against crc/non-crc types elsewhere.
1914                  */
1915                 switch (be16_to_cpu(info->magic)) {
1916                 case XFS_DA_NODE_MAGIC:
1917                 case XFS_DA3_NODE_MAGIC:
1918                         blk->magic = XFS_DA_NODE_MAGIC;
1919                         node = (xfs_da_intnode_t *)info;
1920                         dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1921                         btree = dp->d_ops->node_tree_p(node);
1922                         blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
1923                         if (forward)
1924                                 blk->index = 0;
1925                         else
1926                                 blk->index = nodehdr.count - 1;
1927                         blkno = be32_to_cpu(btree[blk->index].before);
1928                         break;
1929                 case XFS_ATTR_LEAF_MAGIC:
1930                 case XFS_ATTR3_LEAF_MAGIC:
1931                         blk->magic = XFS_ATTR_LEAF_MAGIC;
1932                         ASSERT(level == path->active-1);
1933                         blk->index = 0;
1934                         blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
1935                         break;
1936                 case XFS_DIR2_LEAFN_MAGIC:
1937                 case XFS_DIR3_LEAFN_MAGIC:
1938                         blk->magic = XFS_DIR2_LEAFN_MAGIC;
1939                         ASSERT(level == path->active-1);
1940                         blk->index = 0;
1941                         blk->hashval = xfs_dir2_leaf_lasthash(args->dp,
1942                                                               blk->bp, NULL);
1943                         break;
1944                 default:
1945                         ASSERT(0);
1946                         break;
1947                 }
1948         }
1949         *result = 0;
1950         return 0;
1951 }
1952
1953
1954 /*========================================================================
1955  * Utility routines.
1956  *========================================================================*/
1957
1958 /*
1959  * Implement a simple hash on a character string.
1960  * Rotate the hash value by 7 bits, then XOR each character in.
1961  * This is implemented with some source-level loop unrolling.
1962  */
1963 xfs_dahash_t
1964 xfs_da_hashname(const uint8_t *name, int namelen)
1965 {
1966         xfs_dahash_t hash;
1967
1968         /*
1969          * Do four characters at a time as long as we can.
1970          */
1971         for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
1972                 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
1973                        (name[3] << 0) ^ rol32(hash, 7 * 4);
1974
1975         /*
1976          * Now do the rest of the characters.
1977          */
1978         switch (namelen) {
1979         case 3:
1980                 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
1981                        rol32(hash, 7 * 3);
1982         case 2:
1983                 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
1984         case 1:
1985                 return (name[0] << 0) ^ rol32(hash, 7 * 1);
1986         default: /* case 0: */
1987                 return hash;
1988         }
1989 }
1990
1991 enum xfs_dacmp
1992 xfs_da_compname(
1993         struct xfs_da_args *args,
1994         const unsigned char *name,
1995         int             len)
1996 {
1997         return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
1998                                         XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
1999 }
2000
2001 static xfs_dahash_t
2002 xfs_default_hashname(
2003         struct xfs_name *name)
2004 {
2005         return xfs_da_hashname(name->name, name->len);
2006 }
2007
2008 const struct xfs_nameops xfs_default_nameops = {
2009         .hashname       = xfs_default_hashname,
2010         .compname       = xfs_da_compname
2011 };
2012
2013 int
2014 xfs_da_grow_inode_int(
2015         struct xfs_da_args      *args,
2016         xfs_fileoff_t           *bno,
2017         int                     count)
2018 {
2019         struct xfs_trans        *tp = args->trans;
2020         struct xfs_inode        *dp = args->dp;
2021         int                     w = args->whichfork;
2022         xfs_rfsblock_t          nblks = dp->i_d.di_nblocks;
2023         struct xfs_bmbt_irec    map, *mapp;
2024         int                     nmap, error, got, i, mapi;
2025
2026         /*
2027          * Find a spot in the file space to put the new block.
2028          */
2029         error = xfs_bmap_first_unused(tp, dp, count, bno, w);
2030         if (error)
2031                 return error;
2032
2033         /*
2034          * Try mapping it in one filesystem block.
2035          */
2036         nmap = 1;
2037         ASSERT(args->firstblock != NULL);
2038         error = xfs_bmapi_write(tp, dp, *bno, count,
2039                         xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
2040                         args->firstblock, args->total, &map, &nmap,
2041                         args->dfops);
2042         if (error)
2043                 return error;
2044
2045         ASSERT(nmap <= 1);
2046         if (nmap == 1) {
2047                 mapp = &map;
2048                 mapi = 1;
2049         } else if (nmap == 0 && count > 1) {
2050                 xfs_fileoff_t           b;
2051                 int                     c;
2052
2053                 /*
2054                  * If we didn't get it and the block might work if fragmented,
2055                  * try without the CONTIG flag.  Loop until we get it all.
2056                  */
2057                 mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
2058                 for (b = *bno, mapi = 0; b < *bno + count; ) {
2059                         nmap = MIN(XFS_BMAP_MAX_NMAP, count);
2060                         c = (int)(*bno + count - b);
2061                         error = xfs_bmapi_write(tp, dp, b, c,
2062                                         xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2063                                         args->firstblock, args->total,
2064                                         &mapp[mapi], &nmap, args->dfops);
2065                         if (error)
2066                                 goto out_free_map;
2067                         if (nmap < 1)
2068                                 break;
2069                         mapi += nmap;
2070                         b = mapp[mapi - 1].br_startoff +
2071                             mapp[mapi - 1].br_blockcount;
2072                 }
2073         } else {
2074                 mapi = 0;
2075                 mapp = NULL;
2076         }
2077
2078         /*
2079          * Count the blocks we got, make sure it matches the total.
2080          */
2081         for (i = 0, got = 0; i < mapi; i++)
2082                 got += mapp[i].br_blockcount;
2083         if (got != count || mapp[0].br_startoff != *bno ||
2084             mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
2085             *bno + count) {
2086                 error = -ENOSPC;
2087                 goto out_free_map;
2088         }
2089
2090         /* account for newly allocated blocks in reserved blocks total */
2091         args->total -= dp->i_d.di_nblocks - nblks;
2092
2093 out_free_map:
2094         if (mapp != &map)
2095                 kmem_free(mapp);
2096         return error;
2097 }
2098
2099 /*
2100  * Add a block to the btree ahead of the file.
2101  * Return the new block number to the caller.
2102  */
2103 int
2104 xfs_da_grow_inode(
2105         struct xfs_da_args      *args,
2106         xfs_dablk_t             *new_blkno)
2107 {
2108         xfs_fileoff_t           bno;
2109         int                     error;
2110
2111         trace_xfs_da_grow_inode(args);
2112
2113         bno = args->geo->leafblk;
2114         error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
2115         if (!error)
2116                 *new_blkno = (xfs_dablk_t)bno;
2117         return error;
2118 }
2119
2120 /*
2121  * Ick.  We need to always be able to remove a btree block, even
2122  * if there's no space reservation because the filesystem is full.
2123  * This is called if xfs_bunmapi on a btree block fails due to ENOSPC.
2124  * It swaps the target block with the last block in the file.  The
2125  * last block in the file can always be removed since it can't cause
2126  * a bmap btree split to do that.
2127  */
2128 STATIC int
2129 xfs_da3_swap_lastblock(
2130         struct xfs_da_args      *args,
2131         xfs_dablk_t             *dead_blknop,
2132         struct xfs_buf          **dead_bufp)
2133 {
2134         struct xfs_da_blkinfo   *dead_info;
2135         struct xfs_da_blkinfo   *sib_info;
2136         struct xfs_da_intnode   *par_node;
2137         struct xfs_da_intnode   *dead_node;
2138         struct xfs_dir2_leaf    *dead_leaf2;
2139         struct xfs_da_node_entry *btree;
2140         struct xfs_da3_icnode_hdr par_hdr;
2141         struct xfs_inode        *dp;
2142         struct xfs_trans        *tp;
2143         struct xfs_mount        *mp;
2144         struct xfs_buf          *dead_buf;
2145         struct xfs_buf          *last_buf;
2146         struct xfs_buf          *sib_buf;
2147         struct xfs_buf          *par_buf;
2148         xfs_dahash_t            dead_hash;
2149         xfs_fileoff_t           lastoff;
2150         xfs_dablk_t             dead_blkno;
2151         xfs_dablk_t             last_blkno;
2152         xfs_dablk_t             sib_blkno;
2153         xfs_dablk_t             par_blkno;
2154         int                     error;
2155         int                     w;
2156         int                     entno;
2157         int                     level;
2158         int                     dead_level;
2159
2160         trace_xfs_da_swap_lastblock(args);
2161
2162         dead_buf = *dead_bufp;
2163         dead_blkno = *dead_blknop;
2164         tp = args->trans;
2165         dp = args->dp;
2166         w = args->whichfork;
2167         ASSERT(w == XFS_DATA_FORK);
2168         mp = dp->i_mount;
2169         lastoff = args->geo->freeblk;
2170         error = xfs_bmap_last_before(tp, dp, &lastoff, w);
2171         if (error)
2172                 return error;
2173         if (unlikely(lastoff == 0)) {
2174                 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
2175                                  mp);
2176                 return -EFSCORRUPTED;
2177         }
2178         /*
2179          * Read the last block in the btree space.
2180          */
2181         last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
2182         error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
2183         if (error)
2184                 return error;
2185         /*
2186          * Copy the last block into the dead buffer and log it.
2187          */
2188         memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
2189         xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
2190         dead_info = dead_buf->b_addr;
2191         /*
2192          * Get values from the moved block.
2193          */
2194         if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
2195             dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
2196                 struct xfs_dir3_icleaf_hdr leafhdr;
2197                 struct xfs_dir2_leaf_entry *ents;
2198
2199                 dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
2200                 dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
2201                 ents = dp->d_ops->leaf_ents_p(dead_leaf2);
2202                 dead_level = 0;
2203                 dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
2204         } else {
2205                 struct xfs_da3_icnode_hdr deadhdr;
2206
2207                 dead_node = (xfs_da_intnode_t *)dead_info;
2208                 dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
2209                 btree = dp->d_ops->node_tree_p(dead_node);
2210                 dead_level = deadhdr.level;
2211                 dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
2212         }
2213         sib_buf = par_buf = NULL;
2214         /*
2215          * If the moved block has a left sibling, fix up the pointers.
2216          */
2217         if ((sib_blkno = be32_to_cpu(dead_info->back))) {
2218                 error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
2219                 if (error)
2220                         goto done;
2221                 sib_info = sib_buf->b_addr;
2222                 if (unlikely(
2223                     be32_to_cpu(sib_info->forw) != last_blkno ||
2224                     sib_info->magic != dead_info->magic)) {
2225                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
2226                                          XFS_ERRLEVEL_LOW, mp);
2227                         error = -EFSCORRUPTED;
2228                         goto done;
2229                 }
2230                 sib_info->forw = cpu_to_be32(dead_blkno);
2231                 xfs_trans_log_buf(tp, sib_buf,
2232                         XFS_DA_LOGRANGE(sib_info, &sib_info->forw,
2233                                         sizeof(sib_info->forw)));
2234                 sib_buf = NULL;
2235         }
2236         /*
2237          * If the moved block has a right sibling, fix up the pointers.
2238          */
2239         if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
2240                 error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
2241                 if (error)
2242                         goto done;
2243                 sib_info = sib_buf->b_addr;
2244                 if (unlikely(
2245                        be32_to_cpu(sib_info->back) != last_blkno ||
2246                        sib_info->magic != dead_info->magic)) {
2247                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
2248                                          XFS_ERRLEVEL_LOW, mp);
2249                         error = -EFSCORRUPTED;
2250                         goto done;
2251                 }
2252                 sib_info->back = cpu_to_be32(dead_blkno);
2253                 xfs_trans_log_buf(tp, sib_buf,
2254                         XFS_DA_LOGRANGE(sib_info, &sib_info->back,
2255                                         sizeof(sib_info->back)));
2256                 sib_buf = NULL;
2257         }
2258         par_blkno = args->geo->leafblk;
2259         level = -1;
2260         /*
2261          * Walk down the tree looking for the parent of the moved block.
2262          */
2263         for (;;) {
2264                 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
2265                 if (error)
2266                         goto done;
2267                 par_node = par_buf->b_addr;
2268                 dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
2269                 if (level >= 0 && level != par_hdr.level + 1) {
2270                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
2271                                          XFS_ERRLEVEL_LOW, mp);
2272                         error = -EFSCORRUPTED;
2273                         goto done;
2274                 }
2275                 level = par_hdr.level;
2276                 btree = dp->d_ops->node_tree_p(par_node);
2277                 for (entno = 0;
2278                      entno < par_hdr.count &&
2279                      be32_to_cpu(btree[entno].hashval) < dead_hash;
2280                      entno++)
2281                         continue;
2282                 if (entno == par_hdr.count) {
2283                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
2284                                          XFS_ERRLEVEL_LOW, mp);
2285                         error = -EFSCORRUPTED;
2286                         goto done;
2287                 }
2288                 par_blkno = be32_to_cpu(btree[entno].before);
2289                 if (level == dead_level + 1)
2290                         break;
2291                 xfs_trans_brelse(tp, par_buf);
2292                 par_buf = NULL;
2293         }
2294         /*
2295          * We're in the right parent block.
2296          * Look for the right entry.
2297          */
2298         for (;;) {
2299                 for (;
2300                      entno < par_hdr.count &&
2301                      be32_to_cpu(btree[entno].before) != last_blkno;
2302                      entno++)
2303                         continue;
2304                 if (entno < par_hdr.count)
2305                         break;
2306                 par_blkno = par_hdr.forw;
2307                 xfs_trans_brelse(tp, par_buf);
2308                 par_buf = NULL;
2309                 if (unlikely(par_blkno == 0)) {
2310                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
2311                                          XFS_ERRLEVEL_LOW, mp);
2312                         error = -EFSCORRUPTED;
2313                         goto done;
2314                 }
2315                 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
2316                 if (error)
2317                         goto done;
2318                 par_node = par_buf->b_addr;
2319                 dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
2320                 if (par_hdr.level != level) {
2321                         XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
2322                                          XFS_ERRLEVEL_LOW, mp);
2323                         error = -EFSCORRUPTED;
2324                         goto done;
2325                 }
2326                 btree = dp->d_ops->node_tree_p(par_node);
2327                 entno = 0;
2328         }
2329         /*
2330          * Update the parent entry pointing to the moved block.
2331          */
2332         btree[entno].before = cpu_to_be32(dead_blkno);
2333         xfs_trans_log_buf(tp, par_buf,
2334                 XFS_DA_LOGRANGE(par_node, &btree[entno].before,
2335                                 sizeof(btree[entno].before)));
2336         *dead_blknop = last_blkno;
2337         *dead_bufp = last_buf;
2338         return 0;
2339 done:
2340         if (par_buf)
2341                 xfs_trans_brelse(tp, par_buf);
2342         if (sib_buf)
2343                 xfs_trans_brelse(tp, sib_buf);
2344         xfs_trans_brelse(tp, last_buf);
2345         return error;
2346 }
2347
2348 /*
2349  * Remove a btree block from a directory or attribute.
2350  */
2351 int
2352 xfs_da_shrink_inode(
2353         xfs_da_args_t   *args,
2354         xfs_dablk_t     dead_blkno,
2355         struct xfs_buf  *dead_buf)
2356 {
2357         xfs_inode_t *dp;
2358         int done, error, w, count;
2359         xfs_trans_t *tp;
2360
2361         trace_xfs_da_shrink_inode(args);
2362
2363         dp = args->dp;
2364         w = args->whichfork;
2365         tp = args->trans;
2366         count = args->geo->fsbcount;
2367         for (;;) {
2368                 /*
2369                  * Remove extents.  If we get ENOSPC for a dir we have to move
2370                  * the last block to the place we want to kill.
2371                  */
2372                 error = xfs_bunmapi(tp, dp, dead_blkno, count,
2373                                     xfs_bmapi_aflag(w), 0, args->firstblock,
2374                                     args->dfops, &done);
2375                 if (error == -ENOSPC) {
2376                         if (w != XFS_DATA_FORK)
2377                                 break;
2378                         error = xfs_da3_swap_lastblock(args, &dead_blkno,
2379                                                       &dead_buf);
2380                         if (error)
2381                                 break;
2382                 } else {
2383                         break;
2384                 }
2385         }
2386         xfs_trans_binval(tp, dead_buf);
2387         return error;
2388 }
2389
2390 /*
2391  * See if the mapping(s) for this btree block are valid, i.e.
2392  * don't contain holes, are logically contiguous, and cover the whole range.
2393  */
2394 STATIC int
2395 xfs_da_map_covers_blocks(
2396         int             nmap,
2397         xfs_bmbt_irec_t *mapp,
2398         xfs_dablk_t     bno,
2399         int             count)
2400 {
2401         int             i;
2402         xfs_fileoff_t   off;
2403
2404         for (i = 0, off = bno; i < nmap; i++) {
2405                 if (mapp[i].br_startblock == HOLESTARTBLOCK ||
2406                     mapp[i].br_startblock == DELAYSTARTBLOCK) {
2407                         return 0;
2408                 }
2409                 if (off != mapp[i].br_startoff) {
2410                         return 0;
2411                 }
2412                 off += mapp[i].br_blockcount;
2413         }
2414         return off == bno + count;
2415 }
2416
2417 /*
2418  * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map.
2419  *
2420  * For the single map case, it is assumed that the caller has provided a pointer
2421  * to a valid xfs_buf_map.  For the multiple map case, this function will
2422  * allocate the xfs_buf_map to hold all the maps and replace the caller's single
2423  * map pointer with the allocated map.
2424  */
2425 static int
2426 xfs_buf_map_from_irec(
2427         struct xfs_mount        *mp,
2428         struct xfs_buf_map      **mapp,
2429         int                     *nmaps,
2430         struct xfs_bmbt_irec    *irecs,
2431         int                     nirecs)
2432 {
2433         struct xfs_buf_map      *map;
2434         int                     i;
2435
2436         ASSERT(*nmaps == 1);
2437         ASSERT(nirecs >= 1);
2438
2439         if (nirecs > 1) {
2440                 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2441                                   KM_SLEEP | KM_NOFS);
2442                 if (!map)
2443                         return -ENOMEM;
2444                 *mapp = map;
2445         }
2446
2447         *nmaps = nirecs;
2448         map = *mapp;
2449         for (i = 0; i < *nmaps; i++) {
2450                 ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK &&
2451                        irecs[i].br_startblock != HOLESTARTBLOCK);
2452                 map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock);
2453                 map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount);
2454         }
2455         return 0;
2456 }
2457
2458 /*
2459  * Map the block we are given ready for reading. There are three possible return
2460  * values:
2461  *      -1 - will be returned if we land in a hole and mappedbno == -2 so the
2462  *           caller knows not to execute a subsequent read.
2463  *       0 - if we mapped the block successfully
2464  *      >0 - positive error number if there was an error.
2465  */
2466 static int
2467 xfs_dabuf_map(
2468         struct xfs_inode        *dp,
2469         xfs_dablk_t             bno,
2470         xfs_daddr_t             mappedbno,
2471         int                     whichfork,
2472         struct xfs_buf_map      **map,
2473         int                     *nmaps)
2474 {
2475         struct xfs_mount        *mp = dp->i_mount;
2476         int                     nfsb;
2477         int                     error = 0;
2478         struct xfs_bmbt_irec    irec;
2479         struct xfs_bmbt_irec    *irecs = &irec;
2480         int                     nirecs;
2481
2482         ASSERT(map && *map);
2483         ASSERT(*nmaps == 1);
2484
2485         if (whichfork == XFS_DATA_FORK)
2486                 nfsb = mp->m_dir_geo->fsbcount;
2487         else
2488                 nfsb = mp->m_attr_geo->fsbcount;
2489
2490         /*
2491          * Caller doesn't have a mapping.  -2 means don't complain
2492          * if we land in a hole.
2493          */
2494         if (mappedbno == -1 || mappedbno == -2) {
2495                 /*
2496                  * Optimize the one-block case.
2497                  */
2498                 if (nfsb != 1)
2499                         irecs = kmem_zalloc(sizeof(irec) * nfsb,
2500                                             KM_SLEEP | KM_NOFS);
2501
2502                 nirecs = nfsb;
2503                 error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs,
2504                                        &nirecs, xfs_bmapi_aflag(whichfork));
2505                 if (error)
2506                         goto out;
2507         } else {
2508                 irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno);
2509                 irecs->br_startoff = (xfs_fileoff_t)bno;
2510                 irecs->br_blockcount = nfsb;
2511                 irecs->br_state = 0;
2512                 nirecs = 1;
2513         }
2514
2515         if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2516                 error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
2517                 if (unlikely(error == -EFSCORRUPTED)) {
2518                         if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2519                                 int i;
2520                                 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
2521                                         __func__, (long long)bno,
2522                                         (long long)dp->i_ino);
2523                                 for (i = 0; i < *nmaps; i++) {
2524                                         xfs_alert(mp,
2525 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2526                                                 i,
2527                                                 (long long)irecs[i].br_startoff,
2528                                                 (long long)irecs[i].br_startblock,
2529                                                 (long long)irecs[i].br_blockcount,
2530                                                 irecs[i].br_state);
2531                                 }
2532                         }
2533                         XFS_ERROR_REPORT("xfs_da_do_buf(1)",
2534                                          XFS_ERRLEVEL_LOW, mp);
2535                 }
2536                 goto out;
2537         }
2538         error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs);
2539 out:
2540         if (irecs != &irec)
2541                 kmem_free(irecs);
2542         return error;
2543 }
2544
2545 /*
2546  * Get a buffer for the dir/attr block.
2547  */
2548 int
2549 xfs_da_get_buf(
2550         struct xfs_trans        *trans,
2551         struct xfs_inode        *dp,
2552         xfs_dablk_t             bno,
2553         xfs_daddr_t             mappedbno,
2554         struct xfs_buf          **bpp,
2555         int                     whichfork)
2556 {
2557         struct xfs_buf          *bp;
2558         struct xfs_buf_map      map;
2559         struct xfs_buf_map      *mapp;
2560         int                     nmap;
2561         int                     error;
2562
2563         *bpp = NULL;
2564         mapp = &map;
2565         nmap = 1;
2566         error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2567                                 &mapp, &nmap);
2568         if (error) {
2569                 /* mapping a hole is not an error, but we don't continue */
2570                 if (error == -1)
2571                         error = 0;
2572                 goto out_free;
2573         }
2574
2575         bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2576                                     mapp, nmap, 0);
2577         error = bp ? bp->b_error : -EIO;
2578         if (error) {
2579                 if (bp)
2580                         xfs_trans_brelse(trans, bp);
2581                 goto out_free;
2582         }
2583
2584         *bpp = bp;
2585
2586 out_free:
2587         if (mapp != &map)
2588                 kmem_free(mapp);
2589
2590         return error;
2591 }
2592
2593 /*
2594  * Get a buffer for the dir/attr block, fill in the contents.
2595  */
2596 int
2597 xfs_da_read_buf(
2598         struct xfs_trans        *trans,
2599         struct xfs_inode        *dp,
2600         xfs_dablk_t             bno,
2601         xfs_daddr_t             mappedbno,
2602         struct xfs_buf          **bpp,
2603         int                     whichfork,
2604         const struct xfs_buf_ops *ops)
2605 {
2606         struct xfs_buf          *bp;
2607         struct xfs_buf_map      map;
2608         struct xfs_buf_map      *mapp;
2609         int                     nmap;
2610         int                     error;
2611
2612         *bpp = NULL;
2613         mapp = &map;
2614         nmap = 1;
2615         error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2616                                 &mapp, &nmap);
2617         if (error) {
2618                 /* mapping a hole is not an error, but we don't continue */
2619                 if (error == -1)
2620                         error = 0;
2621                 goto out_free;
2622         }
2623
2624         error = xfs_trans_read_buf_map(dp->i_mount, trans,
2625                                         dp->i_mount->m_ddev_targp,
2626                                         mapp, nmap, 0, &bp, ops);
2627         if (error)
2628                 goto out_free;
2629
2630         if (whichfork == XFS_ATTR_FORK)
2631                 xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
2632         else
2633                 xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
2634         *bpp = bp;
2635 out_free:
2636         if (mapp != &map)
2637                 kmem_free(mapp);
2638
2639         return error;
2640 }
2641
2642 /*
2643  * Readahead the dir/attr block.
2644  */
2645 int
2646 xfs_da_reada_buf(
2647         struct xfs_inode        *dp,
2648         xfs_dablk_t             bno,
2649         xfs_daddr_t             mappedbno,
2650         int                     whichfork,
2651         const struct xfs_buf_ops *ops)
2652 {
2653         struct xfs_buf_map      map;
2654         struct xfs_buf_map      *mapp;
2655         int                     nmap;
2656         int                     error;
2657
2658         mapp = &map;
2659         nmap = 1;
2660         error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
2661                                 &mapp, &nmap);
2662         if (error) {
2663                 /* mapping a hole is not an error, but we don't continue */
2664                 if (error == -1)
2665                         error = 0;
2666                 goto out_free;
2667         }
2668
2669         mappedbno = mapp[0].bm_bn;
2670         xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
2671
2672 out_free:
2673         if (mapp != &map)
2674                 kmem_free(mapp);
2675
2676         return error;
2677 }