From: Greg Kroah-Hartman <gregkh@suse.de>
Subject: revert ext4 changes in 2.6.27.19 and 2.6.27.20 and 2.6.27.25
Patch-mainline: no

As we are already taking a different version of ext4, revert the
changes that were made to ext4 in 2.6.27.19, 2.6.27.20, and 2.6.27.25.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
--- b/Documentation/filesystems/ext4.txt
+++ a/Documentation/filesystems/ext4.txt
@@ -73,7 +73,7 @@
* extent format more robust in face of on-disk corruption due to magics,
* internal redunancy in tree
* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
-* lift 32000 subdirectory limit imposed by i_links_count[1]
* nsec timestamps for mtime, atime, ctime, create time
* inode version field on disk (NFSv4, Lustre)
* reduced e2fsck time via uninit_bg feature
@@ -88,9 +88,6 @@
* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
the ordering)

-[1] Filesystems with a block size of 1k may see a limit imposed by the
-directory hash tree having a maximum depth of two.
-
2.2 Candidate features for future inclusion

* Online defrag (patches available but not well tested)
reverted:
--- b/fs/ext4/balloc.c
+++ a/fs/ext4/balloc.c
@@ -20,7 +20,6 @@
#include "ext4.h"
#include "ext4_jbd2.h"
#include "group.h"
-#include "mballoc.h"

/*
* balloc.c contains the blocks allocation and deallocation routines
@@ -319,41 +318,18 @@
block_group, bitmap_blk);
return NULL;
}
+ if (bh_uptodate_or_lock(bh))
-
- if (bitmap_uptodate(bh))
return bh;

- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
- return bh;
- }
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
- if (buffer_uptodate(bh)) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh);
- unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
@@ -861,136 +837,6 @@
}

/**
- * ext4_add_groupblocks() -- Add given blocks to an existing group
- * @handle: handle to this transaction
- * @sb: super block
- * @block: start physcial block to add to the block group
- * @count: number of blocks to free
- *
- * This marks the blocks as free in the bitmap. We ask the
- * mballoc to reload the buddy after this by setting group
- * EXT4_GROUP_INFO_NEED_INIT_BIT flag
- */
-void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gd_bh;
- ext4_group_t block_group;
- ext4_grpblk_t bit;
- unsigned long i;
- struct ext4_group_desc *desc;
- struct ext4_super_block *es;
- struct ext4_sb_info *sbi;
- int err = 0, ret;
- ext4_grpblk_t blocks_freed;
- struct ext4_group_info *grp;
-
- sbi = EXT4_SB(sb);
- es = sbi->s_es;
- ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
-
- ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- grp = ext4_get_group_info(sb, block_group);
- /*
- * Check to see if we are freeing blocks across a group
- * boundary.
- */
- if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
- goto error_return;
-
- bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
- goto error_return;
- desc = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!desc)
- goto error_return;
-
- if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
- in_range(ext4_inode_bitmap(sb, desc), block, count) ||
- in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
- in_range(block + count - 1, ext4_inode_table(sb, desc),
- sbi->s_itb_per_group)) {
- ext4_error(sb, __func__,
- "Adding blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
- goto error_return;
- }
-
- /*
- * We are about to add blocks to the bitmap,
- * so we need undo access.
- */
- BUFFER_TRACE(bitmap_bh, "getting undo access");
- err = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (err)
- goto error_return;
-
- /*
- * We are about to modify some metadata. Call the journal APIs
- * to unshare ->b_data if a currently-committing transaction is
- * using it
- */
- BUFFER_TRACE(gd_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, gd_bh);
- if (err)
- goto error_return;
- /*
- * make sure we don't allow a parallel init on other groups in the
- * same buddy cache
- */
- down_write(&grp->alloc_sem);
- for (i = 0, blocks_freed = 0; i < count; i++) {
- BUFFER_TRACE(bitmap_bh, "clear bit");
- if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
- bit + i, bitmap_bh->b_data)) {
- ext4_error(sb, __func__,
- "bit already cleared for block %llu",
- (ext4_fsblk_t)(block + i));
- BUFFER_TRACE(bitmap_bh, "bit already cleared");
- } else {
- blocks_freed++;
- }
- }
- spin_lock(sb_bgl_lock(sbi, block_group));
- le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
- desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
- }
- /*
- * request to reload the buddy with the
- * new bitmap information
- */
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
- ext4_mb_update_group_info(grp, blocks_freed);
- up_write(&grp->alloc_sem);
-
- /* We dirtied the bitmap block */
- BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
- err = ext4_journal_dirty_metadata(handle, bitmap_bh);
-
- /* And the group descriptor block */
- BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
- ret = ext4_journal_dirty_metadata(handle, gd_bh);
- if (!err)
- err = ret;
- sb->s_dirt = 1;
-
-error_return:
- brelse(bitmap_bh);
- ext4_std_error(sb, err);
- return;
-}
-
-/**
* ext4_free_blocks() -- Free given blocks and update quota
* @handle: handle for this transaction
* @inode: inode
reverted:
--- b/fs/ext4/ext4.h
+++ a/fs/ext4/ext4.h
@@ -19,7 +19,6 @@
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
-#include <linux/jbd2.h>
#include "ext4_i.h"

/*
@@ -248,30 +247,6 @@
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */

-/* Flags that should be inherited by new inodes from their parent. */
-#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
- EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
- EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
- EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
- EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
-
-/* Flags that are appropriate for regular files (all but dir-specific ones). */
-#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
-
-/* Flags that are appropriate for non-directories/regular files. */
-#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
-
-/* Mask out flags that are inappropriate for the given type of inode. */
-static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
-{
- if (S_ISDIR(mode))
- return flags;
- else if (S_ISREG(mode))
- return flags & EXT4_REG_FLMASK;
- else
- return flags & EXT4_OTHER_FLMASK;
-}
-
/*
* Inode dynamic state flags
*/
@@ -279,7 +254,6 @@
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */

/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -327,9 +301,7 @@
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
- /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
-#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)

/*
* ioctl commands in 32 bit emulation
@@ -887,7 +859,7 @@
{
unsigned len = le16_to_cpu(dlen);

+ if (len == EXT4_MAX_REC_LEN)
- if (len == EXT4_MAX_REC_LEN || len == 0)
return 1 << 16;
return len;
}
@@ -917,9 +889,6 @@
#define DX_HASH_LEGACY 0
#define DX_HASH_HALF_MD4 1
#define DX_HASH_TEA 2
-#define DX_HASH_LEGACY_UNSIGNED 3
-#define DX_HASH_HALF_MD4_UNSIGNED 4
-#define DX_HASH_TEA_UNSIGNED 5

#ifdef __KERNEL__

@@ -1019,11 +988,9 @@
ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
+extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count,
-extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
unsigned long *pdquot_freed_blocks);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count);
extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
extern void ext4_check_blocks_bitmap (struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
@@ -1071,13 +1038,12 @@
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
-extern int ext4_mb_add_groupinfo(struct super_block *sb,
ext4_group_t i, struct ext4_group_desc *desc);
extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
ext4_grpblk_t add);
+
+
-extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
-extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
- ext4_group_t, int);
/* inode.c */
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
@@ -1105,14 +1071,13 @@
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
-extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);

/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1202,11 +1167,8 @@

static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
{
+ return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+ le32_to_cpu(raw_inode->i_size_lo);
- if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
- return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
- le32_to_cpu(raw_inode->i_size_lo);
- else
- return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
}

static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
@@ -1282,23 +1244,6 @@
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
int extend_disksize, int flag);
-/*
- * Add new method to test wether block and inode bitmaps are properly
- * initialized. With uninit_bg reading the block from disk is not enough
- * to mark the bitmap uptodate. We need to also zero-out the bitmap
- */
-#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
-
-static inline int bitmap_uptodate(struct buffer_head *bh)
-{
- return (buffer_uptodate(bh) &&
- test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
-}
-static inline void set_bitmap_uptodate(struct buffer_head *bh)
-{
- set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
-}
-
#endif /* __KERNEL__ */

#endif /* _EXT4_H */
reverted:
--- b/fs/ext4/ext4_sb.h
+++ a/fs/ext4/ext4_sb.h
@@ -56,7 +56,6 @@
u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
- int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
@@ -103,8 +102,7 @@
struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
+ unsigned short *s_mb_offsets, *s_mb_maxs;
- unsigned short *s_mb_offsets;
- unsigned int *s_mb_maxs;

/* tunables */
unsigned long s_stripe;
reverted:
--- b/fs/ext4/extents.c
+++ a/fs/ext4/extents.c
@@ -1118,8 +1118,7 @@
struct ext4_extent_idx *ix;
struct ext4_extent *ex;
ext4_fsblk_t block;
+ int depth, ee_len;
- int depth; /* Note, NOT eh_depth; depth from top of tree */
- int ee_len;

BUG_ON(path == NULL);
depth = path->p_depth;
@@ -1178,8 +1177,7 @@
if (bh == NULL)
return -EIO;
eh = ext_block_hdr(bh);
+ if (ext4_ext_check_header(inode, eh, depth)) {
- /* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -1633,13 +1631,11 @@
{
struct ext4_ext_cache *cex;
BUG_ON(len == 0);
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;
cex->ec_type = type;
cex->ec_block = block;
cex->ec_len = len;
cex->ec_start = start;
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}

/*
@@ -1696,17 +1692,12 @@
struct ext4_extent *ex)
{
struct ext4_ext_cache *cex;
- int ret = EXT4_EXT_CACHE_NO;

- /*
- * We borrow i_block_reservation_lock to protect i_cached_extent
- */
- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;

/* has cache valid data? */
if (cex->ec_type == EXT4_EXT_CACHE_NO)
+ return EXT4_EXT_CACHE_NO;
- goto errout;

BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP &&
cex->ec_type != EXT4_EXT_CACHE_EXTENT);
@@ -1717,11 +1708,11 @@
ext_debug("%u cached by %u:%u:%llu\n",
block,
cex->ec_block, cex->ec_len, cex->ec_start);
+ return cex->ec_type;
- ret = cex->ec_type;
}
+
+ /* not in cache */
+ return EXT4_EXT_CACHE_NO;
-errout:
- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
- return ret;
}

/*
@@ -2677,8 +2668,6 @@
if (allocated > max_blocks)
allocated = max_blocks;
set_buffer_unwritten(bh_result);
- bh_result->b_bdev = inode->i_sb->s_bdev;
- bh_result->b_blocknr = newblock;
goto out2;
}

reverted:
--- b/fs/ext4/file.c
+++ a/fs/ext4/file.c
@@ -33,14 +33,9 @@
*/
static int ext4_release_file (struct inode * inode, struct file * filp)
{
- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
- ext4_alloc_da_blocks(inode);
- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
- }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
+ (atomic_read(&inode->i_writecount) == 1))
- (atomic_read(&inode->i_writecount) == 1) &&
- !EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_reservation(inode);
reverted:
--- b/fs/ext4/hash.c
+++ a/fs/ext4/hash.c
@@ -35,71 +35,23 @@


/* The old legacy hash */
+static __u32 dx_hack_hash (const char *name, int len)
-static __u32 dx_hack_hash_unsigned(const char *name, int len)
{
+ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const unsigned char *ucp = (const unsigned char *) name;
-
- while (len--) {
- hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
-
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
- hash1 = hash0;
- hash0 = hash;
- }
- return hash0 << 1;
-}
-
-static __u32 dx_hack_hash_signed(const char *name, int len)
-{
- __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
- const signed char *scp = (const signed char *) name;
-
while (len--) {
+ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
- hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));

+ if (hash & 0x80000000) hash -= 0x7fffffff;
- if (hash & 0x80000000)
- hash -= 0x7fffffff;
hash1 = hash0;
hash0 = hash;
}
+ return (hash0 << 1);
- return hash0 << 1;
}

+static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
-static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
{
__u32 pad, val;
int i;
- const signed char *scp = (const signed char *) msg;
-
- pad = (__u32)len | ((__u32)len << 8);
- pad |= pad << 16;
-
- val = pad;
- if (len > num*4)
- len = num * 4;
- for (i = 0; i < len; i++) {
- if ((i % 4) == 0)
- val = pad;
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
- }
- if (--num >= 0)
- *buf++ = val;
- while (--num >= 0)
- *buf++ = pad;
-}
-
-static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
-{
- __u32 pad, val;
- int i;
- const unsigned char *ucp = (const unsigned char *) msg;

pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
@@ -110,7 +62,7 @@
for (i=0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
+ val = msg[i] + (val << 8);
- val = ((int) ucp[i]) + (val << 8);
if ((i % 4) == 3) {
*buf++ = val;
val = pad;
@@ -143,8 +95,6 @@
const char *p;
int i;
__u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;

/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -163,18 +113,13 @@
}

switch (hinfo->hash_version) {
- case DX_HASH_LEGACY_UNSIGNED:
- hash = dx_hack_hash_unsigned(name, len);
- break;
case DX_HASH_LEGACY:
+ hash = dx_hack_hash(name, len);
- hash = dx_hack_hash_signed(name, len);
break;
- case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
+ str2hashbuf(p, len, in, 8);
- (*str2hashbuf)(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -182,12 +127,10 @@
minor_hash = buf[2];
hash = buf[1];
break;
- case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
case DX_HASH_TEA:
p = name;
while (len > 0) {
+ str2hashbuf(p, len, in, 4);
- (*str2hashbuf)(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
reverted:
--- b/fs/ext4/ialloc.c
+++ a/fs/ext4/ialloc.c
@@ -84,7 +84,7 @@
}

memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
bh->b_data);

return EXT4_INODES_PER_GROUP(sb);
@@ -115,40 +115,18 @@
block_group, bitmap_blk);
return NULL;
}
+ if (bh_uptodate_or_lock(bh))
- if (bitmap_uptodate(bh))
return bh;

- lock_buffer(bh);
- if (bitmap_uptodate(bh)) {
- unlock_buffer(bh);
- return bh;
- }
spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
ext4_init_inode_bitmap(sb, bh, block_group, desc);
- set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
unlock_buffer(bh);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
- if (buffer_uptodate(bh)) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh);
- unlock_buffer(bh);
- return bh;
- }
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
@@ -188,7 +166,7 @@
struct ext4_group_desc * gdp;
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
+ int fatal = 0, err;
- int fatal = 0, err, cleared;
ext4_group_t flex_group;

if (atomic_read(&inode->i_count) > 1) {
@@ -242,12 +220,10 @@
goto error_return;

/* Ok, now we can actually update the inode bitmaps.. */
+ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+ bit, bitmap_bh->b_data))
+ ext4_error (sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
- spin_lock(sb_bgl_lock(sbi, block_group));
- cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
- spin_unlock(sb_bgl_lock(sbi, block_group));
- if (!cleared)
- ext4_error(sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
else {
gdp = ext4_get_group_desc (sb, block_group, &bh2);

@@ -591,77 +567,6 @@
}

/*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
- struct buffer_head *inode_bitmap_bh,
- unsigned long ino, ext4_group_t group, int mode)
-{
- int free = 0, retval = 0;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
- spin_lock(sb_bgl_lock(sbi, group));
- if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
- /* not a free inode */
- retval = 1;
- goto err_ret;
- }
- ino++;
- if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
- ino > EXT4_INODES_PER_GROUP(sb)) {
- spin_unlock(sb_bgl_lock(sbi, group));
- ext4_error(sb, __func__,
- "reserved inode or inode > inodes count - "
- "block_group = %lu, inode=%lu", group,
- ino + group * EXT4_INODES_PER_GROUP(sb));
- return 1;
- }
- /* If we didn't allocate from within the initialized part of the inode
- * table then we need to initialize up to this inode. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- /* When marking the block group with
- * ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unused even though
- * mke2fs could have initialized the same for us.
- * Instead we calculated the value below
- */
-
- free = 0;
- } else {
- free = EXT4_INODES_PER_GROUP(sb) -
- le16_to_cpu(gdp->bg_itable_unused);
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- *
- */
- if (ino > free)
- gdp->bg_itable_unused =
- cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
- }
- le16_add_cpu(&gdp->bg_free_inodes_count, -1);
- if (S_ISDIR(mode)) {
- le16_add_cpu(&gdp->bg_used_dirs_count, 1);
- }
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
- spin_unlock(sb_bgl_lock(sbi, group));
- return retval;
-}
-
-/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
@@ -687,7 +592,6 @@
struct inode *ret;
ext4_group_t i;
int free = 0;
- static int once = 1;
ext4_group_t flex_group;

/* Cannot create files in a deleted directory */
@@ -705,15 +609,6 @@

if (sbi->s_log_groups_per_flex) {
ret2 = find_group_flex(sb, dir, &group);
- if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
- if (ret2 == 0 && once) {
- once = 0;
- printk(KERN_NOTICE "ext4: find_group_flex "
- "failed, fallback succeeded dir %lu\n",
- dir->i_ino);
- }
- }
goto got_group;
}

@@ -754,12 +649,8 @@
if (err)
goto fail;

+ if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
+ ino, bitmap_bh->b_data)) {
- BUFFER_TRACE(bh2, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh2);
- if (err)
- goto fail;
- if (!ext4_claim_inode(sb, bitmap_bh,
- ino, group, mode)) {
/* we won it */
BUFFER_TRACE(bitmap_bh,
"call ext4_journal_dirty_metadata");
@@ -767,13 +658,10 @@
bitmap_bh);
if (err)
goto fail;
- /* zero bit is inode number 1*/
- ino++;
goto got;
}
/* we lost it */
jbd2_journal_release_buffer(handle, bitmap_bh);
- jbd2_journal_release_buffer(handle, bh2);

if (++ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
@@ -793,6 +681,21 @@
goto out;

got:
+ ino++;
+ if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
+ ino > EXT4_INODES_PER_GROUP(sb)) {
+ ext4_error(sb, __func__,
+ "reserved inode or inode > inodes count - "
+ "block_group = %lu, inode=%lu", group,
+ ino + group * EXT4_INODES_PER_GROUP(sb));
+ err = -EIO;
+ goto fail;
+ }
+
+ BUFFER_TRACE(bh2, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh2);
+ if (err) goto fail;
+
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -827,10 +730,47 @@
if (err)
goto fail;
}
+
+ spin_lock(sb_bgl_lock(sbi, group));
+ /* If we didn't allocate from within the initialized part of the inode
+ * table then we need to initialize up to this inode. */
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+
+ /* When marking the block group with
+ * ~EXT4_BG_INODE_UNINIT we don't want to depend
+ * on the value of bg_itable_unused even though
+ * mke2fs could have initialized the same for us.
+ * Instead we calculated the value below
+ */
+
+ free = 0;
+ } else {
+ free = EXT4_INODES_PER_GROUP(sb) -
+ le16_to_cpu(gdp->bg_itable_unused);
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. if it is greater
+ * we need to update the bg_itable_unused count
+ *
+ */
+ if (ino > free)
+ gdp->bg_itable_unused =
+ cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
+ }
+
+ le16_add_cpu(&gdp->bg_free_inodes_count, -1);
+ if (S_ISDIR(mode)) {
+ le16_add_cpu(&gdp->bg_used_dirs_count, 1);
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+ spin_unlock(sb_bgl_lock(sbi, group));
+ BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
+ if (err) goto fail;
- if (err)
- goto fail;

percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
@@ -866,12 +806,16 @@
ei->i_disksize = 0;

/*
+ * Don't inherit extent flag from directory. We set extent flag on
+ * newly created directory and file only if -o extent mount option is
+ * specified
- * Don't inherit extent flag from directory, amongst others. We set
- * extent flag on newly created directory and file only if -o extent
- * mount option is specified
*/
+ ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
+ if (S_ISLNK(mode))
+ ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
+ /* dirsync only applies to directories */
+ if (!S_ISDIR(mode))
+ ei->i_flags &= ~EXT4_DIRSYNC_FL;
- ei->i_flags =
- ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_alloc_info = NULL;
reverted:
--- b/fs/ext4/inode.c
+++ a/fs/ext4/inode.c
@@ -46,10 +46,8 @@
static inline int ext4_begin_ordered_truncate(struct inode *inode,
loff_t new_size)
{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
- return jbd2_journal_begin_ordered_truncate(
- EXT4_SB(inode->i_sb)->s_journal,
- &EXT4_I(inode)->jinode,
- new_size);
}

static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -353,9 +351,9 @@
final = ptrs;
} else {
ext4_warning(inode->i_sb, "ext4_block_to_path",
+ "block %lu > max",
- "block %lu > max in inode %lu",
i_block + direct_blocks +
+ indirect_blocks + double_blocks);
- indirect_blocks + double_blocks, inode->i_ino);
}
if (boundary)
*boundary = final - 1 - (i_block & (ptrs - 1));
@@ -1046,14 +1044,6 @@
EXT4_I(inode)->i_reserved_meta_blocks = mdb;
EXT4_I(inode)->i_allocated_meta_blocks = 0;
spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-
- /*
- * If we have done all the pending block allocations and if
- * there aren't any writers on the inode, we can discard the
- * inode's preallocations.
- */
- if (!total && (atomic_read(&inode->i_writecount) == 0))
- ext4_discard_reservation(inode);
}

/*
@@ -1085,7 +1075,6 @@
int retval;

clear_buffer_mapped(bh);
- clear_buffer_unwritten(bh);

/*
* Try to see if we can get the block without requesting
@@ -1116,18 +1105,6 @@
return retval;

/*
- * When we call get_blocks without the create flag, the
- * BH_Unwritten flag could have gotten set if the blocks
- * requested were part of a uninitialized extent. We need to
- * clear this flag now that we are committed to convert all or
- * part of the uninitialized extent to be an initialized
- * extent. This is because we need to avoid the combination
- * of BH_Unwritten and BH_Mapped flags being simultaneously
- * set on the buffer_head.
- */
- clear_buffer_unwritten(bh);
-
- /*
* New blocks allocate and/or writing to uninitialized extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
@@ -1393,10 +1370,6 @@
goto out;
}

- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;
-
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
@@ -1406,7 +1379,7 @@
*pagep = page;

ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_get_block);
- ext4_get_block);

if (!ret && ext4_should_journal_data(inode)) {
ret = walk_page_buffers(handle, page_buffers(page),
@@ -1675,25 +1648,18 @@
*/
static int mpage_da_submit_io(struct mpage_da_data *mpd)
{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
- long pages_skipped;
struct pagevec pvec;
- unsigned long index, end;
- int ret = 0, err, nr_pages, i;
- struct inode *inode = mpd->inode;
- struct address_space *mapping = inode->i_mapping;

BUG_ON(mpd->next_page <= mpd->first_page);
+ pagevec_init(&pvec, 0);
- /*
- * We need to start from the first_page to the next_page - 1
- * to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->lbh.b_blocknr we would only be looking
- * at the currently mapped buffer_heads.
- */
index = mpd->first_page;
end = mpd->next_page - 1;

- pagevec_init(&pvec, 0);
while (index <= end) {
+ /* XXX: optimize tail */
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
break;
@@ -1705,10 +1671,6 @@
break;
index++;

- BUG_ON(!PageLocked(page));
- BUG_ON(PageWriteback(page));
-
- pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
if (!err)
mpd->pages_written++;
@@ -2029,29 +1991,11 @@
bh = head;
do {
BUG_ON(buffer_locked(bh));
- /*
- * We need to try to allocate
- * unmapped blocks in the same page.
- * Otherwise we won't make progress
- * with the page in ext4_da_writepage
- */
if (buffer_dirty(bh) &&
(!buffer_mapped(bh) || buffer_delay(bh))) {
mpage_add_bh_to_extent(mpd, logical, bh);
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
- } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
- /*
- * mapped dirty buffer. We need to update
- * the b_state because we look at
- * b_state in mpage_da_map_blocks. We don't
- * update b_size because if we find an
- * unmapped buffer_head later we need to
- * use the b_state flag of that buffer_head.
- */
- if (mpd->lbh.b_size == 0)
- mpd->lbh.b_state =
- bh->b_state & BH_FLAGS;
}
logical++;
} while ((bh = bh->b_this_page) != head);
@@ -2118,10 +2062,6 @@
struct buffer_head *bh_result, int create)
{
int ret = 0;
- sector_t invalid_block = ~((sector_t) 0xffff);
-
- if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
- invalid_block = ~0;

BUG_ON(create == 0);
BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
@@ -2143,18 +2083,11 @@
/* not enough space to reserve */
return ret;

+ map_bh(bh_result, inode->i_sb, 0);
- map_bh(bh_result, inode->i_sb, invalid_block);
set_buffer_new(bh_result);
set_buffer_delay(bh_result);
} else if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
- /*
- * With sub-block writes into unwritten extents
- * we also need to mark the buffer as new so that
- * the unwritten parts of the buffer gets correctly zeroed.
- */
- if (buffer_unwritten(bh_result))
- set_buffer_new(bh_result);
ret = 0;
}

@@ -2365,20 +2298,6 @@
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
-
- /*
- * If the filesystem has aborted, it is read-only, so return
- * right away instead of dumping stack traces later on that
- * will obscure the real source of the problem. We test
- * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
- * the latter could be true if the filesystem is mounted
- * read-only, and in that case, ext4_da_writepages should
- * *never* be called, so if that ever happens, we would want
- * the stack trace.
- */
- if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
- return -EROFS;
-
/*
* Make sure nr_to_write is >= sbi->s_mb_stream_request
* This make sure small files blocks are allocated in
@@ -2417,7 +2336,7 @@
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
+ printk(KERN_EMERG "%s: jbd2_start: "
- printk(KERN_CRIT "%s: jbd2_start: "
"%ld pages, ino %lu; err %d\n", __func__,
wbc->nr_to_write, inode->i_ino, ret);
dump_stack();
@@ -2501,9 +2420,6 @@
ret = PTR_ERR(handle);
goto out;
}
- /* We cannot recurse into the filesystem as the transaction is already
- * started */
- flags |= AOP_FLAG_NOFS;

page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
@@ -2617,48 +2533,6 @@
return;
}

-/*
- * Force all delayed allocation blocks to be allocated for a given inode.
- */
-int ext4_alloc_da_blocks(struct inode *inode)
-{
- if (!EXT4_I(inode)->i_reserved_data_blocks &&
- !EXT4_I(inode)->i_reserved_meta_blocks)
- return 0;
-
- /*
- * We do something simple for now. The filemap_flush() will
- * also start triggering a write of the data blocks, which is
- * not strictly speaking necessary (and for users of
- * laptop_mode, not even desirable). However, to do otherwise
- * would require replicating code paths in:
- *
- * ext4_da_writepages() ->
- * write_cache_pages() ---> (via passed in callback function)
- * __mpage_da_writepage() -->
- * mpage_add_bh_to_extent()
- * mpage_da_map_blocks()
- *
- * The problem is that write_cache_pages(), located in
- * mm/page-writeback.c, marks pages clean in preparation for
- * doing I/O, which is not desirable if we're not planning on
- * doing I/O at all.
- *
- * We could call write_cache_pages(), and then redirty all of
- * the pages by calling redirty_page_for_writeback() but that
- * would be ugly in the extreme. So instead we would need to
- * replicate parts of the code in the above functions,
- * simplifying them becuase we wouldn't actually intend to
- * write out the pages, but rather only collect contiguous
- * logical block extents, call the multi-block allocator, and
- * then update the buffer heads with the block allocations.
- *
- * For now, though, we'll cheat by calling filemap_flush(),
- * which will map the blocks, and start the I/O, but not
- * actually wait for the I/O to complete.
- */
- return filemap_flush(inode->i_mapping);
-}

/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -3668,9 +3542,6 @@
if (!ext4_can_truncate(inode))
return;

- if (inode->i_size == 0)
- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
return;
@@ -4088,9 +3959,11 @@
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_HURD)) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+ }
inode->i_size = ext4_isize(raw_inode);
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
@@ -4137,18 +4010,6 @@
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}

- if (ei->i_file_acl &&
- ((ei->i_file_acl <
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
- EXT4_SB(sb)->s_gdb_count)) ||
- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
- ext4_error(sb, __func__,
- "bad extended attribute block %llu in inode #%lu",
- ei->i_file_acl, inode->i_ino);
- ret = -EIO;
- goto bad_inode;
- }
-
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -4163,8 +4024,7 @@
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
+ } else {
- } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
- S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -4172,13 +4032,6 @@
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
- } else {
- brelse(bh);
- ret = -EIO;
- ext4_error(inode->i_sb, __func__,
- "bogus i_mode (%o) for inode=%lu",
- inode->i_mode, inode->i_ino);
- goto bad_inode;
}
brelse (iloc.bh);
ext4_set_inode_flags(inode);
@@ -4956,9 +4809,8 @@
return !buffer_mapped(bh);
}

+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- struct page *page = vmf->page;
loff_t size;
unsigned long len;
int ret = -EINVAL;
@@ -5009,8 +4861,6 @@
goto out_unlock;
ret = 0;
out_unlock:
- if (ret)
- ret = VM_FAULT_SIGBUS;
up_read(&inode->i_alloc_sem);
return ret;
}
reverted:
--- b/fs/ext4/ioctl.c
+++ a/fs/ext4/ioctl.c
@@ -49,7 +49,8 @@
if (err)
return err;

+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~EXT4_DIRSYNC_FL;
- flags = ext4_mask_flags(inode->i_mode, flags);

err = -EPERM;
mutex_lock(&inode->i_mutex);
@@ -287,20 +288,6 @@
return err;
}

- case EXT4_IOC_ALLOC_DA_BLKS:
- {
- int err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- err = mnt_want_write(filp->f_path.mnt);
- if (err)
- return err;
- err = ext4_alloc_da_blocks(inode);
- mnt_drop_write(filp->f_path.mnt);
- return err;
- }
-
default:
return -ENOTTY;
}
reverted:
--- b/fs/ext4/mballoc.c
+++ a/fs/ext4/mballoc.c
@@ -100,7 +100,7 @@
* inode as:
*
* { page }
+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
- * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information. So for each group we
@@ -330,18 +330,6 @@
* object
*
*/
-static struct kmem_cache *ext4_pspace_cachep;
-static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
- ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
-static void ext4_mb_free_committed_blocks(struct super_block *);
-static void ext4_mb_poll_new_transaction(struct super_block *sb,
- handle_t *handle);

static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{
@@ -730,7 +718,7 @@
* stored in the inode as
*
* { page }
+ * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
- * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
*
*
* one block each for bitmap and buddy information.
@@ -796,42 +784,20 @@
if (bh[i] == NULL)
goto out;

+ if (bh_uptodate_or_lock(bh[i]))
- if (bitmap_uptodate(bh[i]))
continue;

- lock_buffer(bh[i]);
- if (bitmap_uptodate(bh[i])) {
- unlock_buffer(bh[i]);
- continue;
- }
spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
- set_bitmap_uptodate(bh[i]);
set_buffer_uptodate(bh[i]);
unlock_buffer(bh[i]);
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
continue;
}
spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
- if (buffer_uptodate(bh[i])) {
- /*
- * if not uninit if bh is uptodate,
- * bitmap is also uptodate
- */
- set_bitmap_uptodate(bh[i]);
- unlock_buffer(bh[i]);
- continue;
- }
get_bh(bh[i]);
- /*
- * submit the buffer_head for read. We can
- * safely mark the bitmap as uptodate now.
- * We do it here so the bitmap uptodate bit
- * get set with buffer lock held.
- */
- set_bitmap_uptodate(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
mb_debug("read bitmap for group %lu\n", first_group + i);
@@ -848,8 +814,6 @@

err = 0;
first_block = page->index * blocks_per_page;
- /* init the page */
- memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
for (i = 0; i < blocks_per_page; i++) {
int group;
struct ext4_group_info *grinfo;
@@ -876,6 +840,7 @@
BUG_ON(incore == NULL);
mb_debug("put buddy for group %u in page %lu/%x\n",
group, page->index, i * blocksize);
+ memset(data, 0xff, blocksize);
grinfo = ext4_get_group_info(sb, group);
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
@@ -883,9 +848,7 @@
/*
* incore got set to the group block bitmap below
*/
- ext4_lock_group(sb, group);
ext4_mb_generate_buddy(sb, data, incore, group);
- ext4_unlock_group(sb, group);
incore = NULL;
} else {
/* this is block of bitmap */
@@ -899,7 +862,6 @@

/* mark all preallocated blks used in in-core bitmap */
ext4_mb_generate_from_pa(sb, data, group);
- ext4_mb_generate_from_freelist(sb, data, group);
ext4_unlock_group(sb, group);

/* set incore so that the buddy information can be
@@ -924,20 +886,18 @@
ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
struct ext4_buddy *e4b)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
int blocks_per_page;
int block;
int pnum;
int poff;
struct page *page;
int ret;
- struct ext4_group_info *grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;

mb_debug("load group %lu\n", group);

blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- grp = ext4_get_group_info(sb, group);

e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = ext4_get_group_info(sb, group);
@@ -945,15 +905,6 @@
e4b->bd_group = group;
e4b->bd_buddy_page = NULL;
e4b->bd_bitmap_page = NULL;
- e4b->alloc_semp = &grp->alloc_sem;
-
- /* Take the read lock on the group alloc
- * sem. This would make sure a parallel
- * ext4_mb_init_group happening on other
- * groups mapped by the page is blocked
- * till we are done with allocation
- */
- down_read(e4b->alloc_semp);

/*
* the buddy cache inode stores the block bitmap
@@ -969,14 +920,6 @@
page = find_get_page(inode->i_mapping, pnum);
if (page == NULL || !PageUptodate(page)) {
if (page)
- /*
- * drop the page reference and try
- * to get the page with lock. If we
- * are not uptodate that implies
- * somebody just created the page but
- * is yet to initialize the same. So
- * wait for it to initialize.
- */
page_cache_release(page);
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
@@ -1042,9 +985,6 @@
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
-
- /* Done with the buddy cache */
- up_read(e4b->alloc_semp);
return ret;
}

@@ -1054,9 +994,6 @@
page_cache_release(e4b->bd_bitmap_page);
if (e4b->bd_buddy_page)
page_cache_release(e4b->bd_buddy_page);
- /* Done with the buddy cache */
- if (e4b->alloc_semp)
- up_read(e4b->alloc_semp);
}


@@ -1094,10 +1031,7 @@
cur += 32;
continue;
}
+ mb_clear_bit_atomic(lock, cur, bm);
- if (lock)
- mb_clear_bit_atomic(lock, cur, bm);
- else
- mb_clear_bit(cur, bm);
cur++;
}
}
@@ -1115,10 +1049,7 @@
cur += 32;
continue;
}
+ mb_set_bit_atomic(lock, cur, bm);
- if (lock)
- mb_set_bit_atomic(lock, cur, bm);
- else
- mb_set_bit(cur, bm);
cur++;
}
}
@@ -1365,20 +1296,13 @@
ac->ac_tail = ret & 0xffff;
ac->ac_buddy = ret >> 16;

+ /* XXXXXXX: SUCH A HORRIBLE **CK */
+ /*FIXME!! Why ? */
- /*
- * take the page reference. We want the page to be pinned
- * so that we don't get a ext4_mb_init_cache_call for this
- * group until we update the bitmap. That would mean we
- * double allocate blocks. The reference is dropped
- * in ext4_mb_release_context
- */
ac->ac_bitmap_page = e4b->bd_bitmap_page;
get_page(ac->ac_bitmap_page);
ac->ac_buddy_page = e4b->bd_buddy_page;
get_page(ac->ac_buddy_page);
+
- /* on allocation we use ac to track the held semaphore */
- ac->alloc_semp = e4b->alloc_semp;
- e4b->alloc_semp = NULL;
/* store last allocated for subsequent stream allocation */
if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
spin_lock(&sbi->s_md_lock);
@@ -1402,8 +1326,6 @@
struct ext4_free_extent ex;
int max;

- if (ac->ac_status == AC_STATUS_FOUND)
- return;
/*
* We don't want to scan for a whole year
*/
@@ -1450,7 +1372,7 @@
struct ext4_free_extent *gex = &ac->ac_g_ex;

BUG_ON(ex->fe_len <= 0);
+ BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
- BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);

@@ -1770,173 +1692,6 @@
return 0;
}

-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
- int i;
- int block, pnum;
- int blocks_per_page;
- int groups_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
-
- groups_per_page = blocks_per_page >> 1;
- if (groups_per_page == 0)
- groups_per_page = 1;
- /* read all groups the page covers into the cache */
- for (i = 0; i < groups_per_page; i++) {
-
- if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
- break;
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- down_write(&grp->alloc_sem);
- }
- return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
- ext4_group_t group, int locked_group)
-{
- int i;
- int block, pnum;
- int blocks_per_page;
- ext4_group_t first_group;
- struct ext4_group_info *grp;
-
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- first_group = pnum * blocks_per_page / 2;
- /* release locks on all the groups */
- for (i = 0; i < locked_group; i++) {
-
- grp = ext4_get_group_info(sb, first_group + i);
- /* take all groups write allocation
- * semaphore. This make sure there is
- * no block allocation going on in any
- * of that groups
- */
- up_write(&grp->alloc_sem);
- }
-
-}
-
-static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
-{
-
- int ret;
- void *bitmap;
- int blocks_per_page;
- int block, pnum, poff;
- int num_grp_locked = 0;
- struct ext4_group_info *this_grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct inode *inode = sbi->s_buddy_cache;
- struct page *page = NULL, *bitmap_page = NULL;
-
- mb_debug("init group %lu\n", group);
- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
- this_grp = ext4_get_group_info(sb, group);
- /*
- * This ensures we don't add group
- * to this buddy cache via resize
- */
- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
- /*
- * somebody initialized the group
- * return without doing anything
- */
- ret = 0;
- goto err;
- }
- /*
- * the buddy cache inode stores the block bitmap
- * and buddy information in consecutive blocks.
- * So for each group we need two blocks.
- */
- block = group * 2;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, NULL);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
- bitmap_page = page;
- bitmap = page_address(page) + (poff * sb->s_blocksize);
-
- /* init buddy cache */
- block++;
- pnum = block / blocks_per_page;
- poff = block % blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
- if (page == bitmap_page) {
- /*
- * If both the bitmap and buddy are in
- * the same page we don't need to force
- * init the buddy
- */
- unlock_page(page);
- } else if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
- ret = ext4_mb_init_cache(page, bitmap);
- if (ret) {
- unlock_page(page);
- goto err;
- }
- unlock_page(page);
- }
- if (page == NULL || !PageUptodate(page)) {
- ret = -EIO;
- goto err;
- }
- mark_page_accessed(page);
-err:
- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
- if (bitmap_page)
- page_cache_release(bitmap_page);
- if (page)
- page_cache_release(page);
- return ret;
-}
-
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
@@ -2020,7 +1775,7 @@
group = 0;

/* quick check to skip empty groups */
+ grp = ext4_get_group_info(ac->ac_sb, group);
- grp = ext4_get_group_info(sb, group);
if (grp->bb_free == 0)
continue;

@@ -2033,9 +1788,10 @@
* we need full data about the group
* to make a good selection
*/
+ err = ext4_mb_load_buddy(sb, group, &e4b);
- err = ext4_mb_init_group(sb, group);
if (err)
goto out;
+ ext4_mb_release_desc(&e4b);
}

/*
@@ -2543,8 +2299,6 @@
}

INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
- init_rwsem(&meta_group_info[i]->alloc_sem);
- meta_group_info[i]->bb_free_root.rb_node = NULL;;

#ifdef DOUBLE_CHECK
{
@@ -2571,6 +2325,54 @@
} /* ext4_mb_add_groupinfo */

/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
+ * datas) are set not up to date so that they will be re-initilaized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/*
* Update an existing group.
* This function is used for online resize
*/
@@ -2693,12 +2495,10 @@
clear_opt(sbi->s_mount_opt, MBALLOC);
return -ENOMEM;
}
-
1872 - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1873 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1874 if (sbi->s_mb_maxs == NULL) {
1875 clear_opt(sbi->s_mount_opt, MBALLOC);
1876 + kfree(sbi->s_mb_maxs);
1877 - kfree(sbi->s_mb_offsets);
1878 return -ENOMEM;
1879 }
1880
1881 @@ -2858,11 +2658,13 @@
1882 static noinline_for_stack void
1883 ext4_mb_free_committed_blocks(struct super_block *sb)
1884 {
1885 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1886 + int err;
1887 + int i;
1888 + int count = 0;
1889 + int count2 = 0;
1890 + struct ext4_free_metadata *md;
1891 struct ext4_buddy e4b;
1892 - struct ext4_group_info *db;
1893 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1894 - int err, count = 0, count2 = 0;
1895 - struct ext4_free_data *entry;
1896
1897 if (list_empty(&sbi->s_committed_transaction))
1898 return;
1899 @@ -2870,46 +2672,44 @@
1900 /* there is committed blocks to be freed yet */
1901 do {
1902 /* get next array of blocks */
1903 + md = NULL;
1904 - entry = NULL;
1905 spin_lock(&sbi->s_md_lock);
1906 if (!list_empty(&sbi->s_committed_transaction)) {
1907 + md = list_entry(sbi->s_committed_transaction.next,
1908 + struct ext4_free_metadata, list);
1909 + list_del(&md->list);
1910 - entry = list_entry(sbi->s_committed_transaction.next,
1911 - struct ext4_free_data, list);
1912 - list_del(&entry->list);
1913 }
1914 spin_unlock(&sbi->s_md_lock);
1915
1916 + if (md == NULL)
1917 - if (entry == NULL)
1918 break;
1919
1920 mb_debug("gonna free %u blocks in group %lu (0x%p):",
1921 + md->num, md->group, md);
1922 - entry->count, entry->group, entry);
1923
1924 + err = ext4_mb_load_buddy(sb, md->group, &e4b);
1925 - err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1926 /* we expect to find existing buddy because it's pinned */
1927 BUG_ON(err != 0);
1928
1929 - db = e4b.bd_info;
1930 /* there are blocks to put in buddy to make them really free */
1931 + count += md->num;
1932 - count += entry->count;
1933 count2++;
1934 + ext4_lock_group(sb, md->group);
1935 + for (i = 0; i < md->num; i++) {
1936 + mb_debug(" %u", md->blocks[i]);
1937 + mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1938 - ext4_lock_group(sb, entry->group);
1939 - /* Take it out of per group rb tree */
1940 - rb_erase(&entry->node, &(db->bb_free_root));
1941 - mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1942 -
1943 - if (!db->bb_free_root.rb_node) {
1944 - /* No more items in the per group rb tree
1945 - * balance refcounts from ext4_mb_free_metadata()
1946 - */
1947 - page_cache_release(e4b.bd_buddy_page);
1948 - page_cache_release(e4b.bd_bitmap_page);
1949 }
1950 + mb_debug("\n");
1951 + ext4_unlock_group(sb, md->group);
1952 - ext4_unlock_group(sb, entry->group);
1953
1954 + /* balance refcounts from ext4_mb_free_metadata() */
1955 + page_cache_release(e4b.bd_buddy_page);
1956 + page_cache_release(e4b.bd_bitmap_page);
1957 +
1958 + kfree(md);
1959 - kmem_cache_free(ext4_free_ext_cachep, entry);
1960 ext4_mb_release_desc(&e4b);
1961 +
1962 + } while (md);
1963 - } while (1);
1964
1965 mb_debug("freed %u blocks in %u structures\n", count, count2);
1966 }
1967 @@ -3064,16 +2864,6 @@
1968 kmem_cache_destroy(ext4_pspace_cachep);
1969 return -ENOMEM;
1970 }
1971 -
1972 - ext4_free_ext_cachep =
1973 - kmem_cache_create("ext4_free_block_extents",
1974 - sizeof(struct ext4_free_data),
1975 - 0, SLAB_RECLAIM_ACCOUNT, NULL);
1976 - if (ext4_free_ext_cachep == NULL) {
1977 - kmem_cache_destroy(ext4_pspace_cachep);
1978 - kmem_cache_destroy(ext4_ac_cachep);
1979 - return -ENOMEM;
1980 - }
1981 #ifdef CONFIG_PROC_FS
1982 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1983 if (proc_root_ext4 == NULL)
1984 @@ -3090,7 +2880,6 @@
1985 #ifdef CONFIG_PROC_FS
1986 remove_proc_entry("fs/ext4", NULL);
1987 #endif
1988 - kmem_cache_destroy(ext4_free_ext_cachep);
1989 }
1990
1991
1992 @@ -3152,8 +2941,8 @@
1993 in_range(block + len - 1, ext4_inode_table(sb, gdp),
1994 EXT4_SB(sb)->s_itb_per_group)) {
1995 ext4_error(sb, __func__,
1996 + "Allocating block in system zone - block = %llu",
1997 + block);
1998 - "Allocating block %llu in system zone of %lu group\n",
1999 - block, ac->ac_b_ex.fe_group);
2000 /* File system mounted not to panic on error
2001 * Fix the bitmap and repeat the block allocation
2002 * We leak some of the blocks here.
2003 @@ -3175,9 +2964,10 @@
2004 }
2005 }
2006 #endif
2007 + mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
2008 + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2009 +
2010 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
2011 - mb_set_bits(NULL, bitmap_bh->b_data,
2012 - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2013 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2014 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2015 gdp->bg_free_blocks_count =
2016 @@ -3400,7 +3190,7 @@
2017 }
2018 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
2019 start > ac->ac_o_ex.fe_logical);
2020 + BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
2021 - BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
2022
2023 /* now prepare goal request */
2024
2025 @@ -3610,37 +3400,10 @@
2026 ac->ac_criteria = 20;
2027 return 1;
2028 }
2029 -
2030 return 0;
2031 }
2032
2033 /*
2034 - * the function goes through all block freed in the group
2035 - * but not yet committed and marks them used in in-core bitmap.
2036 - * buddy must be generated from this bitmap
2037 - * Need to be called with ext4 group lock (ext4_lock_group)
2038 - */
2039 -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
2040 - ext4_group_t group)
2041 -{
2042 - struct rb_node *n;
2043 - struct ext4_group_info *grp;
2044 - struct ext4_free_data *entry;
2045 -
2046 - grp = ext4_get_group_info(sb, group);
2047 - n = rb_first(&(grp->bb_free_root));
2048 -
2049 - while (n) {
2050 - entry = rb_entry(n, struct ext4_free_data, node);
2051 - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
2052 - bitmap, entry->start_blk,
2053 - entry->count);
2054 - n = rb_next(n);
2055 - }
2056 - return;
2057 -}
2058 -
2059 -/*
2060 * the function goes through all preallocation in this group and marks them
2061 * used in in-core bitmap. buddy must be generated from this bitmap
2062 * Need to be called with ext4 group lock (ext4_lock_group)
2063 @@ -3698,7 +3461,6 @@
2064 struct super_block *sb, struct ext4_prealloc_space *pa)
2065 {
2066 unsigned long grp;
2067 - ext4_fsblk_t grp_blk;
2068
2069 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
2070 return;
2071 @@ -3713,12 +3475,8 @@
2072 pa->pa_deleted = 1;
2073 spin_unlock(&pa->pa_lock);
2074
2075 + /* -1 is to protect from crossing allocation group */
2076 + ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
2077 - grp_blk = pa->pa_pstart;
2078 - /* If linear, pa_pstart may be in the next group when pa is used up */
2079 - if (pa->pa_linear)
2080 - grp_blk--;
2081 -
2082 - ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
2083
2084 /*
2085 * possible race:
2086 @@ -3807,8 +3565,6 @@
2087 pa->pa_free = pa->pa_len;
2088 atomic_set(&pa->pa_count, 1);
2089 spin_lock_init(&pa->pa_lock);
2090 - INIT_LIST_HEAD(&pa->pa_inode_list);
2091 - INIT_LIST_HEAD(&pa->pa_group_list);
2092 pa->pa_deleted = 0;
2093 pa->pa_linear = 0;
2094
2095 @@ -3867,7 +3623,6 @@
2096 atomic_set(&pa->pa_count, 1);
2097 spin_lock_init(&pa->pa_lock);
2098 INIT_LIST_HEAD(&pa->pa_inode_list);
2099 - INIT_LIST_HEAD(&pa->pa_group_list);
2100 pa->pa_deleted = 0;
2101 pa->pa_linear = 1;
2102
2103 @@ -4411,7 +4166,6 @@
2104 ac->ac_pa = NULL;
2105 ac->ac_bitmap_page = NULL;
2106 ac->ac_buddy_page = NULL;
2107 - ac->alloc_semp = NULL;
2108 ac->ac_lg = NULL;
2109
2110 /* we have to define context: we'll we work with a file or
2111 @@ -4532,7 +4286,7 @@
2112 pa_inode_list) {
2113 spin_lock(&tmp_pa->pa_lock);
2114 if (tmp_pa->pa_deleted) {
2115 + spin_unlock(&pa->pa_lock);
2116 - spin_unlock(&tmp_pa->pa_lock);
2117 continue;
2118 }
2119 if (!added && pa->pa_free < tmp_pa->pa_free) {
2120 @@ -4577,23 +4331,18 @@
2121 pa->pa_free -= ac->ac_b_ex.fe_len;
2122 pa->pa_len -= ac->ac_b_ex.fe_len;
2123 spin_unlock(&pa->pa_lock);
2124 + /*
2125 + * We want to add the pa to the right bucket.
2126 + * Remove it from the list and while adding
2127 + * make sure the list to which we are adding
2128 + * doesn't grow big.
2129 + */
2130 + if (likely(pa->pa_free)) {
2131 + spin_lock(pa->pa_obj_lock);
2132 + list_del_rcu(&pa->pa_inode_list);
2133 + spin_unlock(pa->pa_obj_lock);
2134 + ext4_mb_add_n_trim(ac);
2135 + }
2136 - }
2137 - }
2138 - if (ac->alloc_semp)
2139 - up_read(ac->alloc_semp);
2140 - if (pa) {
2141 - /*
2142 - * We want to add the pa to the right bucket.
2143 - * Remove it from the list and while adding
2144 - * make sure the list to which we are adding
2145 - * doesn't grow big. We need to release
2146 - * alloc_semp before calling ext4_mb_add_n_trim()
2147 - */
2148 - if (pa->pa_linear && likely(pa->pa_free)) {
2149 - spin_lock(pa->pa_obj_lock);
2150 - list_del_rcu(&pa->pa_inode_list);
2151 - spin_unlock(pa->pa_obj_lock);
2152 - ext4_mb_add_n_trim(ac);
2153 }
2154 ext4_mb_put_pa(ac, ac->ac_sb, pa);
2155 }
2156 @@ -4700,14 +4449,10 @@
2157 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
2158 ext4_mb_new_preallocation(ac);
2159 }
2160 +
2161 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
2162 *errp = ext4_mb_mark_diskspace_used(ac, handle);
2163 if (*errp == -EAGAIN) {
2164 - /*
2165 - * drop the reference that we took
2166 - * in ext4_mb_use_best_found
2167 - */
2168 - ext4_mb_release_context(ac);
2169 ac->ac_b_ex.fe_group = 0;
2170 ac->ac_b_ex.fe_start = 0;
2171 ac->ac_b_ex.fe_len = 0;
2172 @@ -4772,97 +4517,65 @@
2173 ext4_mb_free_committed_blocks(sb);
2174 }
2175
2176 -/*
2177 - * We can merge two free data extents only if the physical blocks
2178 - * are contiguous, AND the extents were freed by the same transaction,
2179 - * AND the blocks are associated with the same group.
2180 - */
2181 -static int can_merge(struct ext4_free_data *entry1,
2182 - struct ext4_free_data *entry2)
2183 -{
2184 - if ((entry1->t_tid == entry2->t_tid) &&
2185 - (entry1->group == entry2->group) &&
2186 - ((entry1->start_blk + entry1->count) == entry2->start_blk))
2187 - return 1;
2188 - return 0;
2189 -}
2190 -
2191 static noinline_for_stack int
2192 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
2193 + ext4_group_t group, ext4_grpblk_t block, int count)
2194 - struct ext4_free_data *new_entry)
2195 {
2196 - ext4_grpblk_t block;
2197 - struct ext4_free_data *entry;
2198 struct ext4_group_info *db = e4b->bd_info;
2199 struct super_block *sb = e4b->bd_sb;
2200 struct ext4_sb_info *sbi = EXT4_SB(sb);
2201 + struct ext4_free_metadata *md;
2202 + int i;
2203 - struct rb_node **n = &db->bb_free_root.rb_node, *node;
2204 - struct rb_node *parent = NULL, *new_node;
2205
2206 BUG_ON(e4b->bd_bitmap_page == NULL);
2207 BUG_ON(e4b->bd_buddy_page == NULL);
2208
2209 + ext4_lock_group(sb, group);
2210 + for (i = 0; i < count; i++) {
2211 + md = db->bb_md_cur;
2212 + if (md && db->bb_tid != handle->h_transaction->t_tid) {
2213 + db->bb_md_cur = NULL;
2214 + md = NULL;
2215 - new_node = &new_entry->node;
2216 - block = new_entry->start_blk;
2217 -
2218 - if (!*n) {
2219 - /* first free block exent. We need to
2220 - protect buddy cache from being freed,
2221 - * otherwise we'll refresh it from
2222 - * on-disk bitmap and lose not-yet-available
2223 - * blocks */
2224 - page_cache_get(e4b->bd_buddy_page);
2225 - page_cache_get(e4b->bd_bitmap_page);
2226 - }
2227 - while (*n) {
2228 - parent = *n;
2229 - entry = rb_entry(parent, struct ext4_free_data, node);
2230 - if (block < entry->start_blk)
2231 - n = &(*n)->rb_left;
2232 - else if (block >= (entry->start_blk + entry->count))
2233 - n = &(*n)->rb_right;
2234 - else {
2235 - ext4_error(sb, __func__,
2236 - "Double free of blocks %d (%d %d)\n",
2237 - block, entry->start_blk, entry->count);
2238 - return 0;
2239 }
2240 - }
2241
2242 + if (md == NULL) {
2243 + ext4_unlock_group(sb, group);
2244 + md = kmalloc(sizeof(*md), GFP_NOFS);
2245 + if (md == NULL)
2246 + return -ENOMEM;
2247 + md->num = 0;
2248 + md->group = group;
2249 +
2250 + ext4_lock_group(sb, group);
2251 + if (db->bb_md_cur == NULL) {
2252 + spin_lock(&sbi->s_md_lock);
2253 + list_add(&md->list, &sbi->s_active_transaction);
2254 + spin_unlock(&sbi->s_md_lock);
2255 + /* protect buddy cache from being freed,
2256 + * otherwise we'll refresh it from
2257 + * on-disk bitmap and lose not-yet-available
2258 + * blocks */
2259 + page_cache_get(e4b->bd_buddy_page);
2260 + page_cache_get(e4b->bd_bitmap_page);
2261 + db->bb_md_cur = md;
2262 + db->bb_tid = handle->h_transaction->t_tid;
2263 + mb_debug("new md 0x%p for group %lu\n",
2264 + md, md->group);
2265 + } else {
2266 + kfree(md);
2267 + md = db->bb_md_cur;
2268 + }
2269 - rb_link_node(new_node, parent, n);
2270 - rb_insert_color(new_node, &db->bb_free_root);
2271 -
2272 - /* Now try to see the extent can be merged to left and right */
2273 - node = rb_prev(new_node);
2274 - if (node) {
2275 - entry = rb_entry(node, struct ext4_free_data, node);
2276 - if (can_merge(entry, new_entry)) {
2277 - new_entry->start_blk = entry->start_blk;
2278 - new_entry->count += entry->count;
2279 - rb_erase(node, &(db->bb_free_root));
2280 - spin_lock(&sbi->s_md_lock);
2281 - list_del(&entry->list);
2282 - spin_unlock(&sbi->s_md_lock);
2283 - kmem_cache_free(ext4_free_ext_cachep, entry);
2284 }
2285 - }
2286
2287 + BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
2288 + md->blocks[md->num] = block + i;
2289 + md->num++;
2290 + if (md->num == EXT4_BB_MAX_BLOCKS) {
2291 + /* no more space, put full container on a sb's list */
2292 + db->bb_md_cur = NULL;
2293 - node = rb_next(new_node);
2294 - if (node) {
2295 - entry = rb_entry(node, struct ext4_free_data, node);
2296 - if (can_merge(new_entry, entry)) {
2297 - new_entry->count += entry->count;
2298 - rb_erase(node, &(db->bb_free_root));
2299 - spin_lock(&sbi->s_md_lock);
2300 - list_del(&entry->list);
2301 - spin_unlock(&sbi->s_md_lock);
2302 - kmem_cache_free(ext4_free_ext_cachep, entry);
2303 }
2304 }
2305 + ext4_unlock_group(sb, group);
2306 - /* Add the extent to active_transaction list */
2307 - spin_lock(&sbi->s_md_lock);
2308 - list_add(&new_entry->list, &sbi->s_active_transaction);
2309 - spin_unlock(&sbi->s_md_lock);
2310 return 0;
2311 }
2312
2313 @@ -4962,6 +4675,11 @@
2314 err = ext4_journal_get_write_access(handle, gd_bh);
2315 if (err)
2316 goto error_return;
2317 +
2318 + err = ext4_mb_load_buddy(sb, block_group, &e4b);
2319 + if (err)
2320 + goto error_return;
2321 +
2322 #ifdef AGGRESSIVE_CHECK
2323 {
2324 int i;
2325 @@ -4969,6 +4687,13 @@
2326 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
2327 }
2328 #endif
2329 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2330 + bit, count);
2331 +
2332 + /* We dirtied the bitmap block */
2333 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2334 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2335 +
2336 if (ac) {
2337 ac->ac_b_ex.fe_group = block_group;
2338 ac->ac_b_ex.fe_start = bit;
2339 @@ -4976,33 +4701,12 @@
2340 ext4_mb_store_history(ac);
2341 }
2342
2343 - err = ext4_mb_load_buddy(sb, block_group, &e4b);
2344 - if (err)
2345 - goto error_return;
2346 if (metadata) {
2347 + /* blocks being freed are metadata. these blocks shouldn't
2348 + * be used until this transaction is committed */
2349 + ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
2350 - struct ext4_free_data *new_entry;
2351 - /*
2352 - * blocks being freed are metadata. these blocks shouldn't
2353 - * be used until this transaction is committed
2354 - */
2355 - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
2356 - new_entry->start_blk = bit;
2357 - new_entry->group = block_group;
2358 - new_entry->count = count;
2359 - new_entry->t_tid = handle->h_transaction->t_tid;
2360 - ext4_lock_group(sb, block_group);
2361 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2362 - bit, count);
2363 - ext4_mb_free_metadata(handle, &e4b, new_entry);
2364 - ext4_unlock_group(sb, block_group);
2365 } else {
2366 ext4_lock_group(sb, block_group);
2367 - /* need to update group_info->bb_free and bitmap
2368 - * with group lock held. generate_buddy look at
2369 - * them with group lock_held
2370 - */
2371 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
2372 - bit, count);
2373 mb_free_blocks(inode, &e4b, bit, count);
2374 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
2375 ext4_unlock_group(sb, block_group);
2376 @@ -5025,10 +4729,6 @@
2377
2378 *freed += count;
2379
2380 - /* We dirtied the bitmap block */
2381 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
2382 - err = ext4_journal_dirty_metadata(handle, bitmap_bh);
2383 -
2384 /* And the group descriptor block */
2385 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
2386 ret = ext4_journal_dirty_metadata(handle, gd_bh);
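
The mballoc.c hunks above repeatedly rely on the buddy cache layout: the buddy cache inode stores two consecutive logical blocks per group, the block bitmap at block group * 2 and the buddy data at group * 2 + 1, so both sides of the revert locate a group's pages with the same pnum/poff arithmetic. A minimal userspace sketch of that arithmetic (illustrative only, not kernel code; a 4k page and 1k blocksize are assumed for PAGE_CACHE_SIZE / s_blocksize):

        #include <stdio.h>

        int main(void)
        {
                int blocks_per_page = 4096 / 1024;  /* PAGE_CACHE_SIZE / s_blocksize (assumed) */
                int group;

                for (group = 0; group < 4; group++) {
                        int block = group * 2;      /* block bitmap; buddy data is block + 1 */
                        printf("group %d: bitmap page %d off %d, buddy page %d off %d\n",
                               group,
                               block / blocks_per_page, block % blocks_per_page,
                               (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
                }
                return 0;
        }

With four blocks per page, groups 0 and 1 share page 0 and groups 2 and 3 share page 1, which is why the removed ext4_mb_init_group has to handle the bitmap and buddy landing in the same page.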
reverted:
--- b/fs/ext4/mballoc.h
+++ a/fs/ext4/mballoc.h
@@ -18,7 +18,6 @@
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
 #include <linux/version.h>
-#include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 #include "group.h"
@@ -97,27 +96,25 @@
 */
 #define MB_DEFAULT_GROUP_PREALLOC 512

+static struct kmem_cache *ext4_pspace_cachep;
+static struct kmem_cache *ext4_ac_cachep;
-struct ext4_free_data {
- /* this links the free block information from group_info */
- struct rb_node node;

+#ifdef EXT4_BB_MAX_BLOCKS
+#undef EXT4_BB_MAX_BLOCKS
+#endif
+#define EXT4_BB_MAX_BLOCKS 30
- /* this links the free block information from ext4_sb_info */
- struct list_head list;

+struct ext4_free_metadata {
- /* group which free block extent belongs */
 ext4_group_t group;
+ unsigned short num;
+ ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
+ struct list_head list;
-
- /* free block extent */
- ext4_grpblk_t start_blk;
- ext4_grpblk_t count;
-
- /* transaction which freed this extent */
- tid_t t_tid;
 };

 struct ext4_group_info {
 unsigned long bb_state;
+ unsigned long bb_tid;
+ struct ext4_free_metadata *bb_md_cur;
- struct rb_root bb_free_root;
 unsigned short bb_first_free;
 unsigned short bb_free;
 unsigned short bb_fragments;
@@ -125,7 +122,6 @@
 #ifdef DOUBLE_CHECK
 void *bb_bitmap;
 #endif
- struct rw_semaphore alloc_sem;
 unsigned short bb_counters[];
 };

@@ -213,11 +209,6 @@
 __u8 ac_op; /* operation, for history only */
 struct page *ac_bitmap_page;
 struct page *ac_buddy_page;
- /*
- * pointer to the held semaphore upon successful
- * block allocation
- */
- struct rw_semaphore *alloc_semp;
 struct ext4_prealloc_space *ac_pa;
 struct ext4_locality_group *ac_lg;
 };
@@ -251,7 +242,6 @@
 struct super_block *bd_sb;
 __u16 bd_blkbits;
 ext4_group_t bd_group;
- struct rw_semaphore *alloc_semp;
 };
 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
@@ -261,6 +251,8 @@
 {
 return;
 }
+#else
+static void ext4_mb_store_history(struct ext4_allocation_context *ac);
 #endif

 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
@@ -268,6 +260,19 @@
 static struct proc_dir_entry *proc_root_ext4;
 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);

+static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
+ ext4_group_t group);
+static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
+static void ext4_mb_free_committed_blocks(struct super_block *);
+static void ext4_mb_return_to_preallocation(struct inode *inode,
+ struct ext4_buddy *e4b, sector_t block,
+ int count);
+static void ext4_mb_put_pa(struct ext4_allocation_context *,
+ struct super_block *, struct ext4_prealloc_space *pa);
+static int ext4_mb_init_per_dev_proc(struct super_block *sb);
+static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
+
+
 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
@@ -292,7 +297,7 @@
 &(grinfo->bb_state));
 }

+static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
-static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 struct ext4_free_extent *fex)
 {
 ext4_fsblk_t block;
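
The mballoc.h hunk above swaps the rb-tree based struct ext4_free_data back for the older fixed-size struct ext4_free_metadata: freed blocks are recorded one at a time, at most EXT4_BB_MAX_BLOCKS (30) per container, so freeing N blocks in one group during one transaction consumes ceil(N / 30) containers on s_active_transaction, where the removed scheme would hold a contiguous run in a single merged extent node (see can_merge in the mballoc.c hunks above). A small sketch of that capacity arithmetic (illustrative only):

        #include <stdio.h>

        #define EXT4_BB_MAX_BLOCKS 30   /* as in the restored mballoc.h above */

        int main(void)
        {
                int freed = 100;        /* example: blocks freed in one group */
                int containers = (freed + EXT4_BB_MAX_BLOCKS - 1) / EXT4_BB_MAX_BLOCKS;

                printf("%d freed blocks -> %d containers\n", freed, containers);
                return 0;
        }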
reverted:
--- b/fs/ext4/migrate.c
+++ a/fs/ext4/migrate.c
@@ -480,7 +480,7 @@
 + 1);
 if (IS_ERR(handle)) {
 retval = PTR_ERR(handle);
+ goto err_out;
- return retval;
 }
 tmp_inode = ext4_new_inode(handle,
 inode->i_sb->s_root->d_inode,
@@ -488,7 +488,8 @@
 if (IS_ERR(tmp_inode)) {
 retval = -ENOMEM;
 ext4_journal_stop(handle);
+ tmp_inode = NULL;
+ goto err_out;
- return retval;
 }
 i_size_write(tmp_inode, i_size_read(inode));
 /*
@@ -616,7 +617,8 @@

 ext4_journal_stop(handle);

+ if (tmp_inode)
+ iput(tmp_inode);
- iput(tmp_inode);

 return retval;
 }
reverted:
--- b/fs/ext4/namei.c
+++ a/fs/ext4/namei.c
@@ -371,8 +371,6 @@
 goto fail;
 }
 hinfo->hash_version = root->info.hash_version;
- if (hinfo->hash_version <= DX_HASH_TEA)
- hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 if (dentry)
 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
@@ -642,9 +640,6 @@
 dir = dir_file->f_path.dentry->d_inode;
 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version +=
- EXT4_SB(dir->i_sb)->s_hash_unsigned;
 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
 start_hash, start_minor_hash);
@@ -1055,16 +1050,8 @@
 return ERR_PTR(-EIO);
 }
 inode = ext4_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
- if (unlikely(IS_ERR(inode))) {
- if (PTR_ERR(inode) == -ESTALE) {
- ext4_error(dir->i_sb, __func__,
- "deleted inode referenced: %u",
- ino);
- return ERR_PTR(-EIO);
- } else {
- return ERR_CAST(inode);
- }
- }
 }
 return d_splice_alias(inode, dentry);
 }
@@ -1390,7 +1377,7 @@
 struct fake_dirent *fde;

 blocksize = dir->i_sb->s_blocksize;
+ dxtrace(printk("Creating index\n"));
- dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
 retval = ext4_journal_get_write_access(handle, bh);
 if (retval) {
 ext4_std_error(dir->i_sb, retval);
@@ -1399,20 +1386,6 @@
 }
 root = (struct dx_root *) bh->b_data;

- /* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len));
- if ((char *) de >= (((char *) root) + blocksize)) {
- ext4_error(dir->i_sb, __func__,
- "invalid rec_len for '..' in inode %lu",
- dir->i_ino);
- brelse(bh);
- return -EIO;
- }
- len = ((char *) root) + blocksize - (char *) de;
-
- /* Allocate new block for the 0th block's dirents */
 bh2 = ext4_append (handle, dir, &block, &retval);
 if (!(bh2)) {
 brelse(bh);
@@ -1421,6 +1394,11 @@
 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
 data1 = bh2->b_data;

+ /* The 0th block becomes the root, move the dirents out */
+ fde = &root->dotdot;
+ de = (struct ext4_dir_entry_2 *)((char *)fde +
+ ext4_rec_len_from_disk(fde->rec_len));
+ len = ((char *) root) + blocksize - (char *) de;
 memcpy (data1, de, len);
 de = (struct ext4_dir_entry_2 *) data1;
 top = data1 + len;
@@ -1440,8 +1418,6 @@

 /* Initialize as for dx_probe */
 hinfo.hash_version = root->info.hash_version;
- if (hinfo.hash_version <= DX_HASH_TEA)
- hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
 ext4fs_dirhash(name, namelen, &hinfo);
 frame = frames;
@@ -2314,7 +2290,7 @@
 struct inode * old_inode, * new_inode;
 struct buffer_head * old_bh, * new_bh, * dir_bh;
 struct ext4_dir_entry_2 * old_de, * new_de;
+ int retval;
- int retval, force_da_alloc = 0;

 old_bh = new_bh = dir_bh = NULL;

@@ -2452,7 +2428,6 @@
 ext4_mark_inode_dirty(handle, new_inode);
 if (!new_inode->i_nlink)
 ext4_orphan_add(handle, new_inode);
- force_da_alloc = 1;
 }
 retval = 0;

@@ -2461,8 +2436,6 @@
 brelse (old_bh);
 brelse (new_bh);
 ext4_journal_stop(handle);
- if (retval == 0 && force_da_alloc)
- ext4_alloc_da_blocks(old_inode);
 return retval;
 }

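The namei.c hunks above remove every use of s_hash_unsigned, which the super.c hunk further down also stops computing from EXT2_FLAGS_UNSIGNED_HASH / EXT2_FLAGS_SIGNED_HASH. The removed lines shifted the three classic hash versions onto unsigned variants whenever the superblock recorded an unsigned-char dirhash. A sketch of that mapping (illustrative only; the enum values are assumptions based on the ext4 headers of this era, with each unsigned variant defined 3 above its signed counterpart, matching s_hash_unsigned == 3):

        enum {
                DX_HASH_LEGACY            = 0,
                DX_HASH_HALF_MD4          = 1,
                DX_HASH_TEA               = 2,
                DX_HASH_LEGACY_UNSIGNED   = 3,  /* assumed values, see note above */
                DX_HASH_HALF_MD4_UNSIGNED = 4,
                DX_HASH_TEA_UNSIGNED      = 5,
        };

        static int effective_hash_version(int version, int s_hash_unsigned)
        {
                /* mirrors the removed lines: s_hash_unsigned is either 0 or 3 */
                if (version <= DX_HASH_TEA)
                        version += s_hash_unsigned;
                return version;
        }
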
reverted:
--- b/fs/ext4/resize.c
+++ a/fs/ext4/resize.c
@@ -284,9 +284,11 @@
 if ((err = extend_or_restart_transaction(handle, 2, bh)))
 goto exit_bh;

+ mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
+ bh->b_data);
- mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
 ext4_journal_dirty_metadata(handle, bh);
 brelse(bh);
+
 /* Mark unused entries in inode bitmap used */
 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
 input->inode_bitmap, input->inode_bitmap - start);
@@ -295,7 +297,7 @@
 goto exit_journal;
 }

+ mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
- mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
 bh->b_data);
 ext4_journal_dirty_metadata(handle, bh);
 exit_bh:
@@ -745,7 +747,6 @@
 struct inode *inode = NULL;
 handle_t *handle;
 int gdb_off, gdb_num;
- int num_grp_locked = 0;
 int err, err2;

 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -786,7 +787,6 @@
 }
 }

-
 if ((err = verify_group_input(sb, input)))
 goto exit_put;

@@ -855,18 +855,15 @@
 * using the new disk blocks.
 */

- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
 /* Update group descriptor block for new group */
 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
 gdb_off * EXT4_DESC_SIZE(sb));

- memset(gdp, 0, EXT4_DESC_SIZE(sb));
 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
- gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);

 /*
@@ -874,11 +871,9 @@
 * descriptor
 */
 if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
- err = ext4_mb_add_groupinfo(sb, input->group, gdp);
- if (err) {
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
 goto exit_journal;
- }
 }
 /*
 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +915,6 @@

 /* Update the global fs size fields */
 sbi->s_groups_count++;
- ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);

 ext4_journal_dirty_metadata(handle, primary);

@@ -982,7 +976,9 @@
 struct buffer_head * bh;
 handle_t *handle;
 int err;
+ unsigned long freed_blocks;
 ext4_group_t group;
+ struct ext4_group_info *grp;

 /* We don't need to worry about locking wrt other resizers just
 * yet: we're going to revalidate es->s_blocks_count after
@@ -1081,13 +1077,50 @@
 unlock_super(sb);
 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 o_blocks_count + add);
+ ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
- /* We add the blocks to the bitmap and set the group need init bit */
- ext4_add_groupblocks(handle, sb, o_blocks_count, add);
 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
 o_blocks_count + add);
 if ((err = ext4_journal_stop(handle)))
 goto exit_put;

+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
 if (test_opt(sb, DEBUG))
 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
 ext4_blocks_count(es));
reverted:
--- b/fs/ext4/super.c
+++ a/fs/ext4/super.c
@@ -1493,6 +1493,7 @@
 ext4_group_t flex_group_count;
 ext4_group_t flex_group;
 int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
 int i;

 if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1515,6 +1516,9 @@
 goto failed;
 }

+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
 for (i = 0; i < sbi->s_groups_count; i++) {
 gdp = ext4_get_group_desc(sb, i, &bh);

@@ -1916,8 +1920,8 @@
 struct inode *root;
 int ret = -EINVAL;
 int blocksize;
+ int db_count;
+ int i;
- unsigned int db_count;
- unsigned int i;
 int needs_recovery;
 __le32 features;
 __u64 blocks_count;
@@ -2168,18 +2172,6 @@
 for (i = 0; i < 4; i++)
 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 sbi->s_def_hash_version = es->s_def_hash_version;
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
-#ifdef __CHAR_UNSIGNED__
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
-#else
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
-#endif
- sb->s_dirt = 1;
- }

 if (sbi->s_blocks_per_group > blocksize * 8) {
 printk(KERN_ERR
@@ -2207,30 +2199,20 @@
 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 goto cantfind_ext4;

+ /* ensure blocks_count calculation below doesn't sign-extend */
+ if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
+ le32_to_cpu(es->s_first_data_block) + 1) {
+ printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
+ "first data block %u, blocks per group %lu\n",
+ ext4_blocks_count(es),
+ le32_to_cpu(es->s_first_data_block),
+ EXT4_BLOCKS_PER_GROUP(sb));
- /*
- * It makes no sense for the first data block to be beyond the end
- * of the filesystem.
- */
- if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
- printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
- "block %u is beyond end of filesystem (%llu)\n",
- le32_to_cpu(es->s_first_data_block),
- ext4_blocks_count(es));
 goto failed_mount;
 }
 blocks_count = (ext4_blocks_count(es) -
 le32_to_cpu(es->s_first_data_block) +
 EXT4_BLOCKS_PER_GROUP(sb) - 1);
 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
- if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
- "(block count %llu, first data block %u, "
- "blocks per group %lu)\n", sbi->s_groups_count,
- ext4_blocks_count(es),
- le32_to_cpu(es->s_first_data_block),
- EXT4_BLOCKS_PER_GROUP(sb));
- goto failed_mount;
- }
 sbi->s_groups_count = blocks_count;
 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 EXT4_DESC_PER_BLOCK(sb);
@@ -2950,14 +2932,14 @@

 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
+ int ret = 0;
- tid_t target;

 sb->s_dirt = 0;
+ if (wait)
+ ret = ext4_force_commit(sb);
+ else
+ jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+ return ret;
- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
- if (wait)
- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
- }
- return 0;
 }

 /*
reverted:
--- b/fs/jbd2/commit.c
+++ a/fs/jbd2/commit.c
@@ -24,7 +24,6 @@
 #include <linux/crc32.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
-#include <linux/bio.h>

 /*
 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -171,34 +170,12 @@
 * This function along with journal_submit_commit_record
 * allows to write the commit record asynchronously.
 */
+static int journal_wait_on_commit_record(struct buffer_head *bh)
-static int journal_wait_on_commit_record(journal_t *journal,
- struct buffer_head *bh)
 {
 int ret = 0;

-retry:
 clear_buffer_dirty(bh);
 wait_on_buffer(bh);
- if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
- printk(KERN_WARNING
- "JBD2: wait_on_commit_record: sync failed on %s - "
- "disabling barriers\n", journal->j_devname);
- spin_lock(&journal->j_state_lock);
- journal->j_flags &= ~JBD2_BARRIER;
- spin_unlock(&journal->j_state_lock);
-
- lock_buffer(bh);
- clear_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- bh->b_end_io = journal_end_buffer_io_sync;
-
- ret = submit_bh(WRITE_SYNC, bh);
- if (ret) {
- unlock_buffer(bh);
- return ret;
- }
- goto retry;
- }

 if (unlikely(!buffer_uptodate(bh)))
 ret = -EIO;
@@ -818,7 +795,7 @@
 __jbd2_journal_abort_hard(journal);
 }
 if (!err && !is_journal_aborted(journal))
+ err = journal_wait_on_commit_record(cbh);
- err = journal_wait_on_commit_record(journal, cbh);

 if (err)
 jbd2_journal_abort(journal, err);
reverted:
--- b/fs/jbd2/journal.c
+++ a/fs/jbd2/journal.c
@@ -430,7 +430,7 @@
 }

 /*
+ * Called under j_state_lock. Returns true if a transaction was started.
- * Called under j_state_lock. Returns true if a transaction commit was started.
 */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -498,8 +498,7 @@

 /*
 * Start a commit of the current running transaction (if any). Returns true
+ * if a transaction was started, and fills its tid in at *ptid
- * if a transaction is going to be committed (or is currently already
- * committing), and fills its tid in at *ptid
 */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -509,19 +508,15 @@
 if (journal->j_running_transaction) {
 tid_t tid = journal->j_running_transaction->t_tid;

+ ret = __jbd2_log_start_commit(journal, tid);
+ if (ret && ptid)
- __jbd2_log_start_commit(journal, tid);
- /* There's a running transaction and we've just made sure
- * it's commit has been scheduled. */
- if (ptid)
 *ptid = tid;
+ } else if (journal->j_committing_transaction && ptid) {
- ret = 1;
- } else if (journal->j_committing_transaction) {
 /*
 * If ext3_write_super() recently started a commit, then we
 * have to wait for completion of that transaction
 */
+ *ptid = journal->j_committing_transaction->t_tid;
- if (ptid)
- *ptid = journal->j_committing_transaction->t_tid;
 ret = 1;
 }
 spin_unlock(&journal->j_state_lock);
3013 --- b/fs/jbd2/revoke.c
3014 +++ a/fs/jbd2/revoke.c
3015 @@ -55,25 +55,6 @@
3016 * need do nothing.
3017 * RevokeValid set, Revoked set:
3018 * buffer has been revoked.
3019 - *
3020 - * Locking rules:
3021 - * We keep two hash tables of revoke records. One hashtable belongs to the
3022 - * running transaction (is pointed to by journal->j_revoke), the other one
3023 - * belongs to the committing transaction. Accesses to the second hash table
3024 - * happen only from the kjournald and no other thread touches this table. Also
3025 - * journal_switch_revoke_table() which switches which hashtable belongs to the
3026 - * running and which to the committing transaction is called only from
3027 - * kjournald. Therefore we need no locks when accessing the hashtable belonging
3028 - * to the committing transaction.
3029 - *
3030 - * All users operating on the hash table belonging to the running transaction
3031 - * have a handle to the transaction. Therefore they are safe from kjournald
3032 - * switching hash tables under them. For operations on the lists of entries in
3033 - * the hash table j_revoke_lock is used.
3034 - *
3035 - * Finally, also replay code uses the hash tables but at this moment noone else
3036 - * can touch them (filesystem isn't mounted yet) and hence no locking is
3037 - * needed.
3038 */
3039
3040 #ifndef __KERNEL__
3041 @@ -420,6 +401,8 @@
3042 * the second time we would still have a pending revoke to cancel. So,
3043 * do not trust the Revoked bit on buffers unless RevokeValid is also
3044 * set.
3045 + *
3046 + * The caller must have the journal locked.
3047 */
3048 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
3049 {
3050 @@ -497,7 +480,10 @@
3051 /*
3052 * Write revoke records to the journal for all entries in the current
3053 * revoke hash, deleting the entries as we go.
3054 + *
3055 + * Called with the journal lock held.
3056 */
3057 +
3058 void jbd2_journal_write_revoke_records(journal_t *journal,
3059 transaction_t *transaction)
3060 {
reverted:
--- b/fs/jbd2/transaction.c
+++ a/fs/jbd2/transaction.c
@@ -2049,46 +2049,26 @@
 }

 /*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
- * File truncate and transaction commit interact with each other in a
- * non-trivial way. If a transaction writing data block A is
- * committing, we cannot discard the data by truncate until we have
- * written them. Otherwise if we crashed after the transaction with
- * write has committed but before the transaction with truncate has
- * committed, we could see stale data in block A. This function is a
- * helper to solve this problem. It starts writeout of the truncated
- * part in case it is in the committing transaction.
- *
- * Filesystem code must call this function when inode is journaled in
- * ordered mode before truncation happens and after the inode has been
- * placed on orphan list with the new inode size. The second condition
- * avoids the race that someone writes new data and we start
- * committing the transaction after this function has been called but
- * before a transaction for truncate is started (and furthermore it
- * allows us to optimize the case where the addition to orphan list
- * happens in the same transaction as write --- we don't have to write
- * any data in such case).
 */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
-int jbd2_journal_begin_ordered_truncate(journal_t *journal,
- struct jbd2_inode *jinode,
 loff_t new_size)
 {
+ journal_t *journal;
+ transaction_t *commit_trans;
- transaction_t *inode_trans, *commit_trans;
 int ret = 0;

+ if (!inode->i_transaction && !inode->i_next_transaction)
- /* This is a quick check to avoid locking if not necessary */
- if (!jinode->i_transaction)
 goto out;
+ journal = inode->i_transaction->t_journal;
- /* Locks are here just to force reading of recent values, it is
- * enough that the transaction was not committing before we started
- * a transaction adding the inode to orphan list */
 spin_lock(&journal->j_state_lock);
 commit_trans = journal->j_committing_transaction;
 spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
- spin_lock(&journal->j_list_lock);
- inode_trans = jinode->i_transaction;
- spin_unlock(&journal->j_list_lock);
- if (inode_trans == commit_trans) {
- ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 new_size, LLONG_MAX);
 if (ret)
 jbd2_journal_abort(journal, ret);
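
Both versions of the comment above describe the same hazard: if the transaction that wrote a data block is still committing when that block is truncated away, the truncated range must be written out first, or a crash after the write's commit but before the truncate's commit could expose stale data. The guard the restored function applies boils down to (condensed from the '+' lines above, illustrative only):

        if (!inode->i_transaction && !inode->i_next_transaction)
                return 0;       /* no journaled data in flight for this inode */
        journal = inode->i_transaction->t_journal;
        if (inode->i_transaction == journal->j_committing_transaction)
                ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
                                               new_size, LLONG_MAX);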
reverted:
--- b/include/linux/jbd2.h
+++ a/include/linux/jbd2.h
@@ -308,8 +308,7 @@
 int val = (expr); \
 if (!val) { \
 printk(KERN_ERR \
+ "EXT3-fs unexpected failure: %s;\n",# expr); \
- "JBD2 unexpected failure: %s: %s;\n", \
- __func__, #expr); \
 printk(KERN_ERR why "\n"); \
 } \
 val; \
@@ -330,7 +329,6 @@
 BH_State, /* Pins most journal_head state */
 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
- BH_JBDPrivateStart, /* First bit available for private use by FS */
 };

 BUFFER_FNS(JBD, jbd)
@@ -1075,8 +1073,7 @@
 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
 extern int jbd2_journal_force_commit(journal_t *);
 extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
-extern int jbd2_journal_begin_ordered_truncate(journal_t *journal,
- struct jbd2_inode *inode, loff_t new_size);
 extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
 extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
