1 From: Greg Kroah-Hartman <gregkh@suse.de>
2 Subject: Linux 2.6.27.19
3
4 Upstream 2.6.27.19 release from kernel.org
5
6 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
7
8 diff --git a/Makefile b/Makefile
9 index 9273a73..dbe8543 100644
10 --- a/Makefile
11 +++ b/Makefile
12 @@ -1,7 +1,7 @@
13 VERSION = 2
14 PATCHLEVEL = 6
15 SUBLEVEL = 27
16 -EXTRAVERSION = .18
17 +EXTRAVERSION = .19
18 NAME = Trembling Tortoise
19
20 # *DOCUMENTATION*
21 diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
22 index 5af4e9b..ada0692 100644
23 --- a/arch/powerpc/kernel/align.c
24 +++ b/arch/powerpc/kernel/align.c
25 @@ -646,11 +646,16 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
26 unsigned int areg, struct pt_regs *regs,
27 unsigned int flags, unsigned int length)
28 {
29 - char *ptr = (char *) &current->thread.TS_FPR(reg);
30 + char *ptr;
31 int ret = 0;
32
33 flush_vsx_to_thread(current);
34
35 + if (reg < 32)
36 + ptr = (char *) &current->thread.TS_FPR(reg);
37 + else
38 + ptr = (char *) &current->thread.vr[reg - 32];
39 +
40 if (flags & ST)
41 ret = __copy_to_user(addr, ptr, length);
42 else {
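The hunk above stops assuming every VSX register lives in the FPR save area: registers 0-31 map to thread.TS_FPR, while 32-63 map to thread.vr. A minimal userspace sketch of the same dispatch; the fpr/vr arrays and vsx_reg_ptr() below are hypothetical stand-ins for the kernel's per-thread save areas, not kernel API:

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the two per-thread register save areas. */
static double fpr[32];            /* VSX regs 0..31 overlay the FP regs  */
static unsigned char vr[32][16];  /* VSX regs 32..63 overlay the VMX regs */

/* Select the save area the way the patched emulate_vsx() does. */
static char *vsx_reg_ptr(unsigned int reg)
{
    if (reg < 32)
        return (char *) &fpr[reg];
    return (char *) &vr[reg - 32];
}

int main(void)
{
    memset(vsx_reg_ptr(40), 0xab, 16);  /* lands in vr[8], not past fpr[] */
    printf("vr[8][0] = 0x%02x\n", (unsigned) vr[8][0]);
    return 0;
}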
43 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
44 index 5b719a0..7c3b8dc 100644
45 --- a/arch/x86/mm/pageattr.c
46 +++ b/arch/x86/mm/pageattr.c
47 @@ -619,6 +619,13 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
48 unsigned int level;
49 pte_t *kpte, old_pte;
50
51 + /*
52 + * If we're called with lazy mmu updates enabled, the
53 + * in-memory pte state may be stale. Flush pending updates to
54 + * bring them up to date.
55 + */
56 + arch_flush_lazy_mmu_mode();
57 +
58 repeat:
59 kpte = lookup_address(address, &level);
60 if (!kpte)
61 @@ -836,6 +843,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
62 else
63 cpa_flush_all(cache);
64
65 + /*
66 + * If we've been called with lazy mmu updates enabled, then
67 + * make sure that everything gets flushed out before we
68 + * return.
69 + */
70 + arch_flush_lazy_mmu_mode();
71 +
72 out:
73 cpa_fill_pool(NULL);
74
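Both hunks above call arch_flush_lazy_mmu_mode() so that queued (lazy) PTE updates are written out before the in-memory page tables are read, and again before returning. A toy model of that batch-then-flush-before-read pattern, assuming nothing about the real paravirt interface beyond what the comments state:

#include <stdio.h>

/* Toy lazy-update queue: writes are batched, reads must flush first. */
static int table[4];
static struct { int idx, val; } pending[8];
static int npending;

static void lazy_set(int idx, int val)
{
    pending[npending].idx = idx;
    pending[npending].val = val;
    npending++;
}

static void flush_lazy(void)   /* plays the role of arch_flush_lazy_mmu_mode() */
{
    for (int i = 0; i < npending; i++)
        table[pending[i].idx] = pending[i].val;
    npending = 0;
}

static int lookup(int idx)
{
    flush_lazy();              /* without this, the read below can be stale */
    return table[idx];
}

int main(void)
{
    lazy_set(2, 42);
    printf("table[2] = %d\n", lookup(2));
    return 0;
}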
75 diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
76 index c5be6a1..b6f55e8 100644
77 --- a/drivers/ata/pata_via.c
78 +++ b/drivers/ata/pata_via.c
79 @@ -111,7 +111,8 @@ static const struct via_isa_bridge {
80 { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
81 { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
82 { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_SATA_PATA },
83 - { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES},
84 + { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
85 + { "vt6415", PCI_DEVICE_ID_VIA_6415, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST | VIA_NO_ENABLES },
86 { "vt8237a", PCI_DEVICE_ID_VIA_8237A, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
87 { "vt8237", PCI_DEVICE_ID_VIA_8237, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
88 { "vt8235", PCI_DEVICE_ID_VIA_8235, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST },
89 @@ -594,6 +595,7 @@ static int via_reinit_one(struct pci_dev *pdev)
90 #endif
91
92 static const struct pci_device_id via[] = {
93 + { PCI_VDEVICE(VIA, 0x0415), },
94 { PCI_VDEVICE(VIA, 0x0571), },
95 { PCI_VDEVICE(VIA, 0x0581), },
96 { PCI_VDEVICE(VIA, 0x1571), },
97 diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
98 index 89e3b7f..8b6f9c0 100644
99 --- a/drivers/ata/sata_nv.c
100 +++ b/drivers/ata/sata_nv.c
101 @@ -421,19 +421,21 @@ static struct ata_port_operations nv_generic_ops = {
102 .hardreset = ATA_OP_NULL,
103 };
104
105 -/* OSDL bz3352 reports that nf2/3 controllers can't determine device
106 - * signature reliably. Also, the following thread reports detection
107 - * failure on cold boot with the standard debouncing timing.
108 +/* nf2 is ripe with hardreset related problems.
109 + *
110 + * kernel bz#3352 reports nf2/3 controllers can't determine device
111 + * signature reliably. The following thread reports detection failure
112 + * on cold boot with the standard debouncing timing.
113 *
114 * http://thread.gmane.org/gmane.linux.ide/34098
115 *
116 - * Debounce with hotplug timing and request follow-up SRST.
117 + * And bz#12176 reports that hardreset simply doesn't work on nf2.
118 + * Give up on it and just don't do hardreset.
119 */
120 static struct ata_port_operations nv_nf2_ops = {
121 - .inherits = &nv_common_ops,
122 + .inherits = &nv_generic_ops,
123 .freeze = nv_nf2_freeze,
124 .thaw = nv_nf2_thaw,
125 - .hardreset = nv_noclassify_hardreset,
126 };
127
128 /* For initial probing after boot and hot plugging, hardreset mostly
129 diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c
130 index 58630cc..f2ada0c 100644
131 --- a/drivers/bluetooth/btsdio.c
132 +++ b/drivers/bluetooth/btsdio.c
133 @@ -91,6 +91,7 @@ static int btsdio_tx_packet(struct btsdio_data *data, struct sk_buff *skb)
134
135 err = sdio_writesb(data->func, REG_TDAT, skb->data, skb->len);
136 if (err < 0) {
137 + skb_pull(skb, 4);
138 sdio_writeb(data->func, 0x01, REG_PC_WRT, NULL);
139 return err;
140 }
141 @@ -152,7 +153,7 @@ static int btsdio_rx_packet(struct btsdio_data *data)
142
143 err = sdio_readsb(data->func, skb->data, REG_RDAT, len - 4);
144 if (err < 0) {
145 - kfree(skb);
146 + kfree_skb(skb);
147 return err;
148 }
149
150 diff --git a/drivers/net/3c505.c b/drivers/net/3c505.c
151 index fdfb2b2..ae8e36c 100644
152 --- a/drivers/net/3c505.c
153 +++ b/drivers/net/3c505.c
154 @@ -493,21 +493,27 @@ static bool receive_pcb(struct net_device *dev, pcb_struct * pcb)
155 }
156 /* read the data */
157 spin_lock_irqsave(&adapter->lock, flags);
158 - i = 0;
159 - do {
160 - j = 0;
161 - while (((stat = get_status(dev->base_addr)) & ACRF) == 0 && j++ < 20000);
162 - pcb->data.raw[i++] = inb_command(dev->base_addr);
163 - if (i > MAX_PCB_DATA)
164 - INVALID_PCB_MSG(i);
165 - } while ((stat & ASF_PCB_MASK) != ASF_PCB_END && j < 20000);
166 + for (i = 0; i < MAX_PCB_DATA; i++) {
167 + for (j = 0; j < 20000; j++) {
168 + stat = get_status(dev->base_addr);
169 + if (stat & ACRF)
170 + break;
171 + }
172 + pcb->data.raw[i] = inb_command(dev->base_addr);
173 + if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000)
174 + break;
175 + }
176 spin_unlock_irqrestore(&adapter->lock, flags);
177 + if (i >= MAX_PCB_DATA) {
178 + INVALID_PCB_MSG(i);
179 + return false;
180 + }
181 if (j >= 20000) {
182 TIMEOUT_MSG(__LINE__);
183 return false;
184 }
185 - /* woops, the last "data" byte was really the length! */
186 - total_length = pcb->data.raw[--i];
187 + /* the last "data" byte was really the length! */
188 + total_length = pcb->data.raw[i];
189
190 /* safety check total length vs data length */
191 if (total_length != (pcb->length + 2)) {
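The rewrite above replaces a do/while that could advance i past MAX_PCB_DATA before checking it with a for loop that can never index past the array, and it checks the overflow condition before the timeout so each failure path returns false. The same control structure in a standalone sketch, with get_status()/inb_command() stubbed out (the real driver reads I/O ports):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PCB_DATA 62
#define ACRF         0x01
#define ASF_PCB_MASK 0x06
#define ASF_PCB_END  0x04

/* Stubbed hardware accessors standing in for port I/O. */
static int get_status(void)  { return ACRF | ASF_PCB_END; }
static int inb_command(void) { return 0x2a; }

static bool read_pcb(unsigned char *raw)
{
    int i, j, stat = 0;

    for (i = 0; i < MAX_PCB_DATA; i++) {
        for (j = 0; j < 20000; j++) {          /* bounded busy-wait */
            stat = get_status();
            if (stat & ACRF)
                break;
        }
        raw[i] = inb_command();
        if ((stat & ASF_PCB_MASK) == ASF_PCB_END || j >= 20000)
            break;
    }
    if (i >= MAX_PCB_DATA)                      /* never run off the array */
        return false;
    if (j >= 20000)                             /* timed out */
        return false;
    printf("last byte (length) = %u\n", (unsigned) raw[i]);
    return true;
}

int main(void)
{
    unsigned char raw[MAX_PCB_DATA];
    return read_pcb(raw) ? 0 : 1;
}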
192 diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
193 index c3edcdc..2d90a3c 100644
194 --- a/drivers/pci/intel-iommu.c
195 +++ b/drivers/pci/intel-iommu.c
196 @@ -72,6 +72,8 @@ static struct deferred_flush_tables *deferred_flush;
197 /* bitmap for indexing intel_iommus */
198 static int g_num_of_iommus;
199
200 +static int rwbf_quirk = 0;
201 +
202 static DEFINE_SPINLOCK(async_umap_flush_lock);
203 static LIST_HEAD(unmaps_to_do);
204
205 @@ -527,7 +529,7 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu)
206 u32 val;
207 unsigned long flag;
208
209 - if (!cap_rwbf(iommu->cap))
210 + if (!rwbf_quirk && !cap_rwbf(iommu->cap))
211 return;
212 val = iommu->gcmd | DMA_GCMD_WBF;
213
214 @@ -2453,3 +2455,12 @@ int __init intel_iommu_init(void)
215 return 0;
216 }
217
218 +static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
219 +{
220 + /* Mobile 4 Series Chipset neglects to set RWBF capability,
221 + but needs it */
222 + printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
223 + rwbf_quirk = 1;
224 +}
225 +
226 +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
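The quirk above covers hardware that needs write-buffer flushing but fails to advertise it: the flush now proceeds when either the capability bit or the quirk flag is set. A condensed sketch of the pattern, with a hypothetical CAP_RWBF bit standing in for the real capability register layout:

#include <stdio.h>

#define CAP_RWBF 0x10u     /* hypothetical capability bit for the demo */

static int rwbf_quirk;     /* set by a device-specific fixup           */

static void flush_write_buffer(unsigned int cap)
{
    if (!rwbf_quirk && !(cap & CAP_RWBF))
        return;            /* hardware claims no flush is needed */
    puts("flushing write buffer");
}

int main(void)
{
    unsigned int cap = 0;     /* chipset neglects to set RWBF...     */
    flush_write_buffer(cap);  /* ...so nothing happens               */
    rwbf_quirk = 1;           /* quirk matched the broken chipset ID */
    flush_write_buffer(cap);  /* now the flush is forced             */
    return 0;
}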
227 diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
228 index 299e075..55ac5c3 100644
229 --- a/drivers/scsi/libiscsi.c
230 +++ b/drivers/scsi/libiscsi.c
231 @@ -1844,6 +1844,7 @@ void iscsi_pool_free(struct iscsi_pool *q)
232 kfree(q->pool[i]);
233 if (q->pool)
234 kfree(q->pool);
235 + kfree(q->queue);
236 }
237 EXPORT_SYMBOL_GPL(iscsi_pool_free);
238
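The one-line fix above frees q->queue, the queue buffer allocated when the pool was created, which iscsi_pool_free() had been leaking. A standalone sketch of the symmetric cleanup, using a simplified pool struct rather than the real struct iscsi_pool:

#include <stdlib.h>

/* Simplified pool: an item array plus a separately allocated queue buffer. */
struct pool {
    void **items;
    void  *queue;
    int    max;
};

static void pool_free(struct pool *q)
{
    for (int i = 0; i < q->max; i++)
        free(q->items[i]);
    free(q->items);
    free(q->queue);   /* the counterpart of the line the patch adds */
}

int main(void)
{
    struct pool q = { .max = 4 };

    q.items = calloc(q.max, sizeof(void *));
    for (int i = 0; i < q.max; i++)
        q.items[i] = malloc(32);
    q.queue = malloc(q.max * sizeof(void *));
    pool_free(&q);
    return 0;
}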
239 diff --git a/fs/ext2/super.c b/fs/ext2/super.c
240 index fd88c7b..2ebc0c4 100644
241 --- a/fs/ext2/super.c
242 +++ b/fs/ext2/super.c
243 @@ -1177,9 +1177,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
244 es = sbi->s_es;
245 if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
246 (old_mount_opt & EXT2_MOUNT_XIP)) &&
247 - invalidate_inodes(sb))
248 - ext2_warning(sb, __func__, "busy inodes while remounting "\
249 - "xip remain in cache (no functional problem)");
250 + invalidate_inodes(sb)) {
251 + ext2_warning(sb, __func__, "refusing change of xip flag "
252 + "with busy inodes while remounting");
253 + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
254 + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
255 + }
256 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
257 return 0;
258 if (*flags & MS_RDONLY) {
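Rather than warning and leaving the new xip state active alongside stale cached inodes, the remount now refuses the change by restoring the old EXT2_MOUNT_XIP bit. The restore is the usual clear-then-copy single-flag idiom, shown in isolation (the flag value here is made up for the demo):

#include <stdio.h>

#define MOUNT_XIP 0x4000u  /* hypothetical flag value */

int main(void)
{
    unsigned int old_opts = 0;                /* xip was off            */
    unsigned int opts = MOUNT_XIP | 0x1u;     /* remount switched it on */

    /* busy inodes: refuse the change, leaving every other bit alone */
    opts &= ~MOUNT_XIP;
    opts |= old_opts & MOUNT_XIP;

    printf("opts = %#x, xip %s\n", opts, (opts & MOUNT_XIP) ? "on" : "off");
    return 0;
}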
259 diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
260 index e9fa960..8b7c776 100644
261 --- a/fs/ext4/balloc.c
262 +++ b/fs/ext4/balloc.c
263 @@ -20,6 +20,7 @@
264 #include "ext4.h"
265 #include "ext4_jbd2.h"
266 #include "group.h"
267 +#include "mballoc.h"
268
269 /*
270 * balloc.c contains the blocks allocation and deallocation routines
271 @@ -318,18 +319,41 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
272 block_group, bitmap_blk);
273 return NULL;
274 }
275 - if (bh_uptodate_or_lock(bh))
276 +
277 + if (bitmap_uptodate(bh))
278 return bh;
279
280 + lock_buffer(bh);
281 + if (bitmap_uptodate(bh)) {
282 + unlock_buffer(bh);
283 + return bh;
284 + }
285 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
286 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
287 ext4_init_block_bitmap(sb, bh, block_group, desc);
288 + set_bitmap_uptodate(bh);
289 set_buffer_uptodate(bh);
290 unlock_buffer(bh);
291 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
292 return bh;
293 }
294 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
295 + if (buffer_uptodate(bh)) {
296 + /*
297 + * if the group is not uninit and bh is
298 + * uptodate, the bitmap is also uptodate
299 + */
300 + set_bitmap_uptodate(bh);
301 + unlock_buffer(bh);
302 + return bh;
303 + }
304 + /*
305 + * submit the buffer_head for read. We can
306 + * safely mark the bitmap as uptodate now.
307 + * We do it here so the bitmap uptodate bit
308 + * gets set with the buffer lock held.
309 + */
310 + set_bitmap_uptodate(bh);
311 if (bh_submit_read(bh) < 0) {
312 put_bh(bh);
313 ext4_error(sb, __func__,
314 @@ -837,6 +861,136 @@ error_return:
315 }
316
317 /**
318 + * ext4_add_groupblocks() -- Add given blocks to an existing group
319 + * @handle: handle to this transaction
320 + * @sb: super block
321 + * @block: start physical block to add to the block group
322 + * @count: number of blocks to add
323 + *
324 + * This marks the blocks as free in the bitmap. We ask
325 + * mballoc to reload the buddy after this by setting the group's
326 + * EXT4_GROUP_INFO_NEED_INIT_BIT flag
327 + */
328 +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
329 + ext4_fsblk_t block, unsigned long count)
330 +{
331 + struct buffer_head *bitmap_bh = NULL;
332 + struct buffer_head *gd_bh;
333 + ext4_group_t block_group;
334 + ext4_grpblk_t bit;
335 + unsigned long i;
336 + struct ext4_group_desc *desc;
337 + struct ext4_super_block *es;
338 + struct ext4_sb_info *sbi;
339 + int err = 0, ret;
340 + ext4_grpblk_t blocks_freed;
341 + struct ext4_group_info *grp;
342 +
343 + sbi = EXT4_SB(sb);
344 + es = sbi->s_es;
345 + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
346 +
347 + ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
348 + grp = ext4_get_group_info(sb, block_group);
349 + /*
350 + * Check to see if we are freeing blocks across a group
351 + * boundary.
352 + */
353 + if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
354 + goto error_return;
355 +
356 + bitmap_bh = ext4_read_block_bitmap(sb, block_group);
357 + if (!bitmap_bh)
358 + goto error_return;
359 + desc = ext4_get_group_desc(sb, block_group, &gd_bh);
360 + if (!desc)
361 + goto error_return;
362 +
363 + if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
364 + in_range(ext4_inode_bitmap(sb, desc), block, count) ||
365 + in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
366 + in_range(block + count - 1, ext4_inode_table(sb, desc),
367 + sbi->s_itb_per_group)) {
368 + ext4_error(sb, __func__,
369 + "Adding blocks in system zones - "
370 + "Block = %llu, count = %lu",
371 + block, count);
372 + goto error_return;
373 + }
374 +
375 + /*
376 + * We are about to add blocks to the bitmap,
377 + * so we need undo access.
378 + */
379 + BUFFER_TRACE(bitmap_bh, "getting undo access");
380 + err = ext4_journal_get_undo_access(handle, bitmap_bh);
381 + if (err)
382 + goto error_return;
383 +
384 + /*
385 + * We are about to modify some metadata. Call the journal APIs
386 + * to unshare ->b_data if a currently-committing transaction is
387 + * using it
388 + */
389 + BUFFER_TRACE(gd_bh, "get_write_access");
390 + err = ext4_journal_get_write_access(handle, gd_bh);
391 + if (err)
392 + goto error_return;
393 + /*
394 + * make sure we don't allow a parallel init on other groups in the
395 + * same buddy cache
396 + */
397 + down_write(&grp->alloc_sem);
398 + for (i = 0, blocks_freed = 0; i < count; i++) {
399 + BUFFER_TRACE(bitmap_bh, "clear bit");
400 + if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
401 + bit + i, bitmap_bh->b_data)) {
402 + ext4_error(sb, __func__,
403 + "bit already cleared for block %llu",
404 + (ext4_fsblk_t)(block + i));
405 + BUFFER_TRACE(bitmap_bh, "bit already cleared");
406 + } else {
407 + blocks_freed++;
408 + }
409 + }
410 + spin_lock(sb_bgl_lock(sbi, block_group));
411 + le16_add_cpu(&desc->bg_free_blocks_count, blocks_freed);
412 + desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
413 + spin_unlock(sb_bgl_lock(sbi, block_group));
414 + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
415 +
416 + if (sbi->s_log_groups_per_flex) {
417 + ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
418 + spin_lock(sb_bgl_lock(sbi, flex_group));
419 + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
420 + spin_unlock(sb_bgl_lock(sbi, flex_group));
421 + }
422 + /*
423 + * request to reload the buddy with the
424 + * new bitmap information
425 + */
426 + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
427 + ext4_mb_update_group_info(grp, blocks_freed);
428 + up_write(&grp->alloc_sem);
429 +
430 + /* We dirtied the bitmap block */
431 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
432 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
433 +
434 + /* And the group descriptor block */
435 + BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
436 + ret = ext4_journal_dirty_metadata(handle, gd_bh);
437 + if (!err)
438 + err = ret;
439 + sb->s_dirt = 1;
440 +
441 +error_return:
442 + brelse(bitmap_bh);
443 + ext4_std_error(sb, err);
444 + return;
445 +}
446 +
447 +/**
448 * ext4_free_blocks() -- Free given blocks and update quota
449 * @handle: handle for this transaction
450 * @inode: inode
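The heart of ext4_add_groupblocks() above is the loop that clears one bitmap bit per block and counts a block as freed only if the bit was actually set, so a double-free is reported instead of inflating bg_free_blocks_count. A userspace sketch of that clear-and-count loop over a plain byte array; test_and_clear_bit() here is a local helper, not the kernel's:

#include <stdio.h>
#include <string.h>

/* Local helper: clear bit nr in map, return its previous value. */
static int test_and_clear_bit(unsigned int nr, unsigned char *map)
{
    unsigned char mask = 1u << (nr & 7);
    int old = map[nr >> 3] & mask;

    map[nr >> 3] &= ~mask;
    return old != 0;
}

int main(void)
{
    unsigned char bitmap[8];
    unsigned int bit = 10, count = 5, blocks_freed = 0;

    memset(bitmap, 0xff, sizeof(bitmap));  /* every block in use    */
    test_and_clear_bit(12, bitmap);        /* block 12 already free */

    for (unsigned int i = 0; i < count; i++) {
        if (!test_and_clear_bit(bit + i, bitmap))
            printf("bit already cleared for block %u\n", bit + i);
        else
            blocks_freed++;
    }
    printf("blocks_freed = %u of %u\n", blocks_freed, count);  /* 4 of 5 */
    return 0;
}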
451 diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
452 index 4829dac..85f58af 100644
453 --- a/fs/ext4/ext4.h
454 +++ b/fs/ext4/ext4.h
455 @@ -19,6 +19,7 @@
456 #include <linux/types.h>
457 #include <linux/blkdev.h>
458 #include <linux/magic.h>
459 +#include <linux/jbd2.h>
460 #include "ext4_i.h"
461
462 /*
463 @@ -889,6 +890,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len)
464 #define DX_HASH_LEGACY 0
465 #define DX_HASH_HALF_MD4 1
466 #define DX_HASH_TEA 2
467 +#define DX_HASH_LEGACY_UNSIGNED 3
468 +#define DX_HASH_HALF_MD4_UNSIGNED 4
469 +#define DX_HASH_TEA_UNSIGNED 5
470
471 #ifdef __KERNEL__
472
473 @@ -988,9 +992,11 @@ extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
474 ext4_fsblk_t nblocks);
475 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
476 ext4_fsblk_t block, unsigned long count, int metadata);
477 -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
478 - ext4_fsblk_t block, unsigned long count,
479 +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
480 + ext4_fsblk_t block, unsigned long count,
481 unsigned long *pdquot_freed_blocks);
482 +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
483 + ext4_fsblk_t block, unsigned long count);
484 extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
485 extern void ext4_check_blocks_bitmap (struct super_block *);
486 extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
487 @@ -1038,12 +1044,13 @@ extern int __init init_ext4_mballoc(void);
488 extern void exit_ext4_mballoc(void);
489 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
490 unsigned long, unsigned long, int, unsigned long *);
491 -extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
492 +extern int ext4_mb_add_groupinfo(struct super_block *sb,
493 ext4_group_t i, struct ext4_group_desc *desc);
494 extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
495 ext4_grpblk_t add);
496 -
497 -
498 +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
499 +extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
500 + ext4_group_t, int);
501 /* inode.c */
502 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
503 struct buffer_head *bh, ext4_fsblk_t blocknr);
504 @@ -1167,8 +1174,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
505
506 static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
507 {
508 - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
509 - le32_to_cpu(raw_inode->i_size_lo);
510 + if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
511 + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
512 + le32_to_cpu(raw_inode->i_size_lo);
513 + else
514 + return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
515 }
516
517 static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
518 @@ -1244,6 +1254,23 @@ extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
519 sector_t block, unsigned long max_blocks,
520 struct buffer_head *bh, int create,
521 int extend_disksize, int flag);
522 +/*
523 + * Add new method to test whether block and inode bitmaps are properly
524 + * initialized. With uninit_bg, reading the block from disk is not enough
525 + * to mark the bitmap uptodate. We need to also zero-out the bitmap
526 + */
527 +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart
528 +
529 +static inline int bitmap_uptodate(struct buffer_head *bh)
530 +{
531 + return (buffer_uptodate(bh) &&
532 + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state));
533 +}
534 +static inline void set_bitmap_uptodate(struct buffer_head *bh)
535 +{
536 + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
537 +}
538 +
539 #endif /* __KERNEL__ */
540
541 #endif /* _EXT4_H */
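BH_BITMAP_UPTODATE layers a filesystem-private flag on top of the generic buffer state: a bitmap buffer is trustworthy only when buffer_uptodate() and the private bit are both set, because with uninit_bg a freshly read block may still need its bitmap initialized. A self-contained model of the two-flag test (a toy struct, not the kernel's buffer_head):

#include <stdbool.h>
#include <stdio.h>

enum { BH_UPTODATE, BH_BITMAP_UPTODATE };  /* bit numbers in b_state */

struct buffer_head { unsigned long b_state; };

static bool buffer_uptodate(const struct buffer_head *bh)
{
    return bh->b_state & (1UL << BH_UPTODATE);
}

static bool bitmap_uptodate(const struct buffer_head *bh)
{
    /* valid only when the generic and the bitmap-specific bit are set */
    return buffer_uptodate(bh) &&
           (bh->b_state & (1UL << BH_BITMAP_UPTODATE));
}

int main(void)
{
    struct buffer_head bh = { 1UL << BH_UPTODATE };   /* just read from disk */
    printf("%d\n", bitmap_uptodate(&bh));             /* 0: not initialized  */
    bh.b_state |= 1UL << BH_BITMAP_UPTODATE;          /* after zeroing/init  */
    printf("%d\n", bitmap_uptodate(&bh));             /* 1 */
    return 0;
}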
542 diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
543 index 6300226..f20df8a 100644
544 --- a/fs/ext4/ext4_sb.h
545 +++ b/fs/ext4/ext4_sb.h
546 @@ -56,6 +56,7 @@ struct ext4_sb_info {
547 u32 s_next_generation;
548 u32 s_hash_seed[4];
549 int s_def_hash_version;
550 + int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */
551 struct percpu_counter s_freeblocks_counter;
552 struct percpu_counter s_freeinodes_counter;
553 struct percpu_counter s_dirs_counter;
554 @@ -102,7 +103,8 @@ struct ext4_sb_info {
555 struct list_head s_committed_transaction;
556 spinlock_t s_md_lock;
557 tid_t s_last_transaction;
558 - unsigned short *s_mb_offsets, *s_mb_maxs;
559 + unsigned short *s_mb_offsets;
560 + unsigned int *s_mb_maxs;
561
562 /* tunables */
563 unsigned long s_stripe;
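s_mb_maxs[] becomes unsigned int because its first slot holds the number of bits in one block; assuming, as in ext4_mb_init(), that this is s_blocksize << 3, a 64KB block gives 524288, which an unsigned short silently truncates:

#include <stdio.h>

int main(void)
{
    unsigned int bits = 65536u << 3;            /* bits per 64KB block */
    unsigned short old = (unsigned short) bits; /* old s_mb_maxs type  */

    printf("%u -> %u\n", bits, (unsigned) old); /* 524288 -> 0 */
    return 0;
}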
564 diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
565 index 1d6329d..bd7d14d 100644
566 --- a/fs/ext4/hash.c
567 +++ b/fs/ext4/hash.c
568 @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
569
570
571 /* The old legacy hash */
572 -static __u32 dx_hack_hash (const char *name, int len)
573 +static __u32 dx_hack_hash_unsigned(const char *name, int len)
574 {
575 - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
576 + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
577 + const unsigned char *ucp = (const unsigned char *) name;
578 +
579 + while (len--) {
580 + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
581 +
582 + if (hash & 0x80000000)
583 + hash -= 0x7fffffff;
584 + hash1 = hash0;
585 + hash0 = hash;
586 + }
587 + return hash0 << 1;
588 +}
589 +
590 +static __u32 dx_hack_hash_signed(const char *name, int len)
591 +{
592 + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
593 + const signed char *scp = (const signed char *) name;
594 +
595 while (len--) {
596 - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
597 + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
598
599 - if (hash & 0x80000000) hash -= 0x7fffffff;
600 + if (hash & 0x80000000)
601 + hash -= 0x7fffffff;
602 hash1 = hash0;
603 hash0 = hash;
604 }
605 - return (hash0 << 1);
606 + return hash0 << 1;
607 }
608
609 -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
610 +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
611 {
612 __u32 pad, val;
613 int i;
614 + const signed char *scp = (const signed char *) msg;
615 +
616 + pad = (__u32)len | ((__u32)len << 8);
617 + pad |= pad << 16;
618 +
619 + val = pad;
620 + if (len > num*4)
621 + len = num * 4;
622 + for (i = 0; i < len; i++) {
623 + if ((i % 4) == 0)
624 + val = pad;
625 + val = ((int) scp[i]) + (val << 8);
626 + if ((i % 4) == 3) {
627 + *buf++ = val;
628 + val = pad;
629 + num--;
630 + }
631 + }
632 + if (--num >= 0)
633 + *buf++ = val;
634 + while (--num >= 0)
635 + *buf++ = pad;
636 +}
637 +
638 +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
639 +{
640 + __u32 pad, val;
641 + int i;
642 + const unsigned char *ucp = (const unsigned char *) msg;
643
644 pad = (__u32)len | ((__u32)len << 8);
645 pad |= pad << 16;
646 @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
647 for (i=0; i < len; i++) {
648 if ((i % 4) == 0)
649 val = pad;
650 - val = msg[i] + (val << 8);
651 + val = ((int) ucp[i]) + (val << 8);
652 if ((i % 4) == 3) {
653 *buf++ = val;
654 val = pad;
655 @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
656 const char *p;
657 int i;
658 __u32 in[8], buf[4];
659 + void (*str2hashbuf)(const char *, int, __u32 *, int) =
660 + str2hashbuf_signed;
661
662 /* Initialize the default seed for the hash checksum functions */
663 buf[0] = 0x67452301;
664 @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
665 }
666
667 switch (hinfo->hash_version) {
668 + case DX_HASH_LEGACY_UNSIGNED:
669 + hash = dx_hack_hash_unsigned(name, len);
670 + break;
671 case DX_HASH_LEGACY:
672 - hash = dx_hack_hash(name, len);
673 + hash = dx_hack_hash_signed(name, len);
674 break;
675 + case DX_HASH_HALF_MD4_UNSIGNED:
676 + str2hashbuf = str2hashbuf_unsigned;
677 case DX_HASH_HALF_MD4:
678 p = name;
679 while (len > 0) {
680 - str2hashbuf(p, len, in, 8);
681 + (*str2hashbuf)(p, len, in, 8);
682 half_md4_transform(buf, in);
683 len -= 32;
684 p += 32;
685 @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
686 minor_hash = buf[2];
687 hash = buf[1];
688 break;
689 + case DX_HASH_TEA_UNSIGNED:
690 + str2hashbuf = str2hashbuf_unsigned;
691 case DX_HASH_TEA:
692 p = name;
693 while (len > 0) {
694 - str2hashbuf(p, len, in, 4);
695 + (*str2hashbuf)(p, len, in, 4);
696 TEA_transform(buf, in);
697 len -= 16;
698 p += 16;
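The signed/unsigned split above exists because the old code hashed through plain char, whose signedness differs between architectures (signed on x86, unsigned on PowerPC and ARM), so the same name could land in different hash buckets depending on where the directory was created. Both legacy variants from the hunk, lifted into a standalone program; any name containing bytes >= 0x80 makes them diverge:

#include <stdio.h>

typedef unsigned int u32;

static u32 dx_hack_hash_signed(const char *name, int len)
{
    u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
    const signed char *scp = (const signed char *) name;

    while (len--) {
        hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373));
        if (hash & 0x80000000)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

static u32 dx_hack_hash_unsigned(const char *name, int len)
{
    u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
    const unsigned char *ucp = (const unsigned char *) name;

    while (len--) {
        hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373));
        if (hash & 0x80000000)
            hash -= 0x7fffffff;
        hash1 = hash0;
        hash0 = hash;
    }
    return hash0 << 1;
}

int main(void)
{
    const char name[] = "caf\xc3\xa9";   /* bytes >= 0x80 trigger the split */

    printf("signed   %08x\n", dx_hack_hash_signed(name, 5));
    printf("unsigned %08x\n", dx_hack_hash_unsigned(name, 5));
    return 0;
}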
699 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
700 index 9805924..b994854 100644
701 --- a/fs/ext4/ialloc.c
702 +++ b/fs/ext4/ialloc.c
703 @@ -84,7 +84,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
704 }
705
706 memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
707 - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
708 + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
709 bh->b_data);
710
711 return EXT4_INODES_PER_GROUP(sb);
712 @@ -115,18 +115,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
713 block_group, bitmap_blk);
714 return NULL;
715 }
716 - if (bh_uptodate_or_lock(bh))
717 + if (bitmap_uptodate(bh))
718 return bh;
719
720 + lock_buffer(bh);
721 + if (bitmap_uptodate(bh)) {
722 + unlock_buffer(bh);
723 + return bh;
724 + }
725 spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
726 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
727 ext4_init_inode_bitmap(sb, bh, block_group, desc);
728 + set_bitmap_uptodate(bh);
729 set_buffer_uptodate(bh);
730 unlock_buffer(bh);
731 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
732 return bh;
733 }
734 spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
735 + if (buffer_uptodate(bh)) {
736 + /*
737 + * if the group is not uninit and bh is
738 + * uptodate, the bitmap is also uptodate
739 + */
740 + set_bitmap_uptodate(bh);
741 + unlock_buffer(bh);
742 + return bh;
743 + }
744 + /*
745 + * submit the buffer_head for read. We can
746 + * safely mark the bitmap as uptodate now.
747 + * We do it here so the bitmap uptodate bit
748 + * gets set with the buffer lock held.
749 + */
750 + set_bitmap_uptodate(bh);
751 if (bh_submit_read(bh) < 0) {
752 put_bh(bh);
753 ext4_error(sb, __func__,
754 @@ -567,6 +589,77 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
755 }
756
757 /*
758 + * claim the inode from the inode bitmap. If the group
759 + * is uninit we need to take the group's sb_bgl_lock
760 + * and clear the uninit flag. The inode bitmap update
761 + * and group desc uninit flag clear should be done
762 + * while holding sb_bgl_lock so that ext4_read_inode_bitmap
763 + * doesn't race with ext4_claim_inode
764 + */
765 +static int ext4_claim_inode(struct super_block *sb,
766 + struct buffer_head *inode_bitmap_bh,
767 + unsigned long ino, ext4_group_t group, int mode)
768 +{
769 + int free = 0, retval = 0;
770 + struct ext4_sb_info *sbi = EXT4_SB(sb);
771 + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
772 +
773 + spin_lock(sb_bgl_lock(sbi, group));
774 + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
775 + /* not a free inode */
776 + retval = 1;
777 + goto err_ret;
778 + }
779 + ino++;
780 + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
781 + ino > EXT4_INODES_PER_GROUP(sb)) {
782 + spin_unlock(sb_bgl_lock(sbi, group));
783 + ext4_error(sb, __func__,
784 + "reserved inode or inode > inodes count - "
785 + "block_group = %lu, inode=%lu", group,
786 + ino + group * EXT4_INODES_PER_GROUP(sb));
787 + return 1;
788 + }
789 + /* If we didn't allocate from within the initialized part of the inode
790 + * table then we need to initialize up to this inode. */
791 + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
792 +
793 + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
794 + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
795 + /* When marking the block group with
796 + * ~EXT4_BG_INODE_UNINIT we don't want to depend
797 + * on the value of bg_itable_unused even though
798 + * mke2fs could have initialized it for us.
799 + * Instead we calculate the value below
800 + */
801 +
802 + free = 0;
803 + } else {
804 + free = EXT4_INODES_PER_GROUP(sb) -
805 + le16_to_cpu(gdp->bg_itable_unused);
806 + }
807 +
808 + /*
809 + * Check the relative inode number against the last used
810 + * relative inode number in this group. If it is greater,
811 + * we need to update the bg_itable_unused count
812 + *
813 + */
814 + if (ino > free)
815 + gdp->bg_itable_unused =
816 + cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
817 + }
818 + le16_add_cpu(&gdp->bg_free_inodes_count, -1);
819 + if (S_ISDIR(mode)) {
820 + le16_add_cpu(&gdp->bg_used_dirs_count, 1);
821 + }
822 + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
823 +err_ret:
824 + spin_unlock(sb_bgl_lock(sbi, group));
825 + return retval;
826 +}
827 +
828 +/*
829 * There are two policies for allocating an inode. If the new inode is
830 * a directory, then a forward search is made for a block group with both
831 * free space and a low directory-to-inode ratio; if that fails, then of
832 @@ -649,8 +742,12 @@ repeat_in_this_group:
833 if (err)
834 goto fail;
835
836 - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group),
837 - ino, bitmap_bh->b_data)) {
838 + BUFFER_TRACE(bh2, "get_write_access");
839 + err = ext4_journal_get_write_access(handle, bh2);
840 + if (err)
841 + goto fail;
842 + if (!ext4_claim_inode(sb, bitmap_bh,
843 + ino, group, mode)) {
844 /* we won it */
845 BUFFER_TRACE(bitmap_bh,
846 "call ext4_journal_dirty_metadata");
847 @@ -658,10 +755,13 @@ repeat_in_this_group:
848 bitmap_bh);
849 if (err)
850 goto fail;
851 + /* zero bit is inode number 1 */
852 + ino++;
853 goto got;
854 }
855 /* we lost it */
856 jbd2_journal_release_buffer(handle, bitmap_bh);
857 + jbd2_journal_release_buffer(handle, bh2);
858
859 if (++ino < EXT4_INODES_PER_GROUP(sb))
860 goto repeat_in_this_group;
861 @@ -681,21 +781,6 @@ repeat_in_this_group:
862 goto out;
863
864 got:
865 - ino++;
866 - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
867 - ino > EXT4_INODES_PER_GROUP(sb)) {
868 - ext4_error(sb, __func__,
869 - "reserved inode or inode > inodes count - "
870 - "block_group = %lu, inode=%lu", group,
871 - ino + group * EXT4_INODES_PER_GROUP(sb));
872 - err = -EIO;
873 - goto fail;
874 - }
875 -
876 - BUFFER_TRACE(bh2, "get_write_access");
877 - err = ext4_journal_get_write_access(handle, bh2);
878 - if (err) goto fail;
879 -
880 /* We may have to initialize the block bitmap if it isn't already */
881 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
882 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
883 @@ -730,47 +815,10 @@ got:
884 if (err)
885 goto fail;
886 }
887 -
888 - spin_lock(sb_bgl_lock(sbi, group));
889 - /* If we didn't allocate from within the initialized part of the inode
890 - * table then we need to initialize up to this inode. */
891 - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
892 - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
893 - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
894 -
895 - /* When marking the block group with
896 - * ~EXT4_BG_INODE_UNINIT we don't want to depend
897 - * on the value of bg_itable_unused even though
898 - * mke2fs could have initialized the same for us.
899 - * Instead we calculated the value below
900 - */
901 -
902 - free = 0;
903 - } else {
904 - free = EXT4_INODES_PER_GROUP(sb) -
905 - le16_to_cpu(gdp->bg_itable_unused);
906 - }
907 -
908 - /*
909 - * Check the relative inode number against the last used
910 - * relative inode number in this group. if it is greater
911 - * we need to update the bg_itable_unused count
912 - *
913 - */
914 - if (ino > free)
915 - gdp->bg_itable_unused =
916 - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino);
917 - }
918 -
919 - le16_add_cpu(&gdp->bg_free_inodes_count, -1);
920 - if (S_ISDIR(mode)) {
921 - le16_add_cpu(&gdp->bg_used_dirs_count, 1);
922 - }
923 - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
924 - spin_unlock(sb_bgl_lock(sbi, group));
925 - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
926 + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
927 err = ext4_journal_dirty_metadata(handle, bh2);
928 - if (err) goto fail;
929 + if (err)
930 + goto fail;
931
932 percpu_counter_dec(&sbi->s_freeinodes_counter);
933 if (S_ISDIR(mode))
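ext4_claim_inode() sets the bitmap bit and updates the group descriptor counts inside one sb_bgl_lock critical section, instead of taking the lock twice as the old got: path did, so ext4_read_inode_bitmap() can never observe the bit set while bg_free_inodes_count is still stale. A toy model of claim-under-lock; the lock itself is elided to comments so the sketch stays runnable:

#include <stdio.h>

/* Toy group state: bitmap word plus descriptor count, guarded together. */
struct group {
    unsigned long bitmap;   /* one bit per inode           */
    int free_inodes;        /* descriptor count, same lock */
};

/* Returns 0 if we won the bit, 1 if it was already taken. */
static int claim_inode(struct group *g, unsigned int ino)
{
    /* spin_lock(sb_bgl_lock(...)) would go here */
    if (g->bitmap & (1UL << ino))
        return 1;                /* not a free inode */
    g->bitmap |= 1UL << ino;
    g->free_inodes--;            /* updated under the same lock */
    /* spin_unlock(...) */
    return 0;
}

int main(void)
{
    struct group g = { 0, 32 };

    printf("first claim:  %d\n", claim_inode(&g, 11));  /* 0: won it  */
    printf("second claim: %d\n", claim_inode(&g, 11));  /* 1: lost it */
    printf("free inodes:  %d\n", g.free_inodes);        /* 31 */
    return 0;
}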
934 diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
935 index d77f674..6e7f085 100644
936 --- a/fs/ext4/inode.c
937 +++ b/fs/ext4/inode.c
938 @@ -351,9 +351,9 @@ static int ext4_block_to_path(struct inode *inode,
939 final = ptrs;
940 } else {
941 ext4_warning(inode->i_sb, "ext4_block_to_path",
942 - "block %lu > max",
943 + "block %lu > max in inode %lu",
944 i_block + direct_blocks +
945 - indirect_blocks + double_blocks);
946 + indirect_blocks + double_blocks, inode->i_ino);
947 }
948 if (boundary)
949 *boundary = final - 1 - (i_block & (ptrs - 1));
950 @@ -1648,18 +1648,25 @@ struct mpage_da_data {
951 */
952 static int mpage_da_submit_io(struct mpage_da_data *mpd)
953 {
954 - struct address_space *mapping = mpd->inode->i_mapping;
955 - int ret = 0, err, nr_pages, i;
956 - unsigned long index, end;
957 + long pages_skipped;
958 struct pagevec pvec;
959 + unsigned long index, end;
960 + int ret = 0, err, nr_pages, i;
961 + struct inode *inode = mpd->inode;
962 + struct address_space *mapping = inode->i_mapping;
963
964 BUG_ON(mpd->next_page <= mpd->first_page);
965 - pagevec_init(&pvec, 0);
966 + /*
967 + * We need to start from the first_page to the next_page - 1
968 + * to make sure we also write the mapped dirty buffer_heads.
969 + * If we look at mpd->lbh.b_blocknr we would only be looking
970 + * at the currently mapped buffer_heads.
971 + */
972 index = mpd->first_page;
973 end = mpd->next_page - 1;
974
975 + pagevec_init(&pvec, 0);
976 while (index <= end) {
977 - /* XXX: optimize tail */
978 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
979 if (nr_pages == 0)
980 break;
981 @@ -1671,6 +1678,10 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
982 break;
983 index++;
984
985 + BUG_ON(!PageLocked(page));
986 + BUG_ON(PageWriteback(page));
987 +
988 + pages_skipped = mpd->wbc->pages_skipped;
989 err = mapping->a_ops->writepage(page, mpd->wbc);
990 if (!err)
991 mpd->pages_written++;
992 @@ -1991,11 +2002,29 @@ static int __mpage_da_writepage(struct page *page,
993 bh = head;
994 do {
995 BUG_ON(buffer_locked(bh));
996 + /*
997 + * We need to try to allocate
998 + * unmapped blocks in the same page.
999 + * Otherwise we won't make progress
1000 + * with the page in ext4_da_writepage
1001 + */
1002 if (buffer_dirty(bh) &&
1003 (!buffer_mapped(bh) || buffer_delay(bh))) {
1004 mpage_add_bh_to_extent(mpd, logical, bh);
1005 if (mpd->io_done)
1006 return MPAGE_DA_EXTENT_TAIL;
1007 + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
1008 + /*
1009 + * mapped dirty buffer. We need to update
1010 + * the b_state because we look at
1011 + * b_state in mpage_da_map_blocks. We don't
1012 + * update b_size because if we find an
1013 + * unmapped buffer_head later we need to
1014 + * use the b_state flag of that buffer_head.
1015 + */
1016 + if (mpd->lbh.b_size == 0)
1017 + mpd->lbh.b_state =
1018 + bh->b_state & BH_FLAGS;
1019 }
1020 logical++;
1021 } while ((bh = bh->b_this_page) != head);
1022 @@ -2298,6 +2327,20 @@ static int ext4_da_writepages(struct address_space *mapping,
1023 */
1024 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1025 return 0;
1026 +
1027 + /*
1028 + * If the filesystem has aborted, it is read-only, so return
1029 + * right away instead of dumping stack traces later on that
1030 + * will obscure the real source of the problem. We test
1031 + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because
1032 + * the latter could be true if the filesystem is mounted
1033 + * read-only, and in that case, ext4_da_writepages should
1034 + * *never* be called, so if that ever happens, we would want
1035 + * the stack trace.
1036 + */
1037 + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT))
1038 + return -EROFS;
1039 +
1040 /*
1041 * Make sure nr_to_write is >= sbi->s_mb_stream_request
1042 * This make sure small files blocks are allocated in
1043 @@ -2336,7 +2379,7 @@ restart_loop:
1044 handle = ext4_journal_start(inode, needed_blocks);
1045 if (IS_ERR(handle)) {
1046 ret = PTR_ERR(handle);
1047 - printk(KERN_EMERG "%s: jbd2_start: "
1048 + printk(KERN_CRIT "%s: jbd2_start: "
1049 "%ld pages, ino %lu; err %d\n", __func__,
1050 wbc->nr_to_write, inode->i_ino, ret);
1051 dump_stack();
1052 diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
1053 index ba86b56..dbf6c0e 100644
1054 --- a/fs/ext4/mballoc.c
1055 +++ b/fs/ext4/mballoc.c
1056 @@ -100,7 +100,7 @@
1057 * inode as:
1058 *
1059 * { page }
1060 - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1061 + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1062 *
1063 *
1064 * one block each for bitmap and buddy information. So for each group we
1065 @@ -330,6 +330,18 @@
1066 * object
1067 *
1068 */
1069 +static struct kmem_cache *ext4_pspace_cachep;
1070 +static struct kmem_cache *ext4_ac_cachep;
1071 +static struct kmem_cache *ext4_free_ext_cachep;
1072 +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
1073 + ext4_group_t group);
1074 +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1075 + ext4_group_t group);
1076 +static int ext4_mb_init_per_dev_proc(struct super_block *sb);
1077 +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
1078 +static void ext4_mb_free_committed_blocks(struct super_block *);
1079 +static void ext4_mb_poll_new_transaction(struct super_block *sb,
1080 + handle_t *handle);
1081
1082 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
1083 {
1084 @@ -718,7 +730,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
1085 * stored in the inode as
1086 *
1087 * { page }
1088 - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]...
1089 + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
1090 *
1091 *
1092 * one block each for bitmap and buddy information.
1093 @@ -784,20 +796,42 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1094 if (bh[i] == NULL)
1095 goto out;
1096
1097 - if (bh_uptodate_or_lock(bh[i]))
1098 + if (bitmap_uptodate(bh[i]))
1099 continue;
1100
1101 + lock_buffer(bh[i]);
1102 + if (bitmap_uptodate(bh[i])) {
1103 + unlock_buffer(bh[i]);
1104 + continue;
1105 + }
1106 spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1107 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1108 ext4_init_block_bitmap(sb, bh[i],
1109 first_group + i, desc);
1110 + set_bitmap_uptodate(bh[i]);
1111 set_buffer_uptodate(bh[i]);
1112 unlock_buffer(bh[i]);
1113 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1114 continue;
1115 }
1116 spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
1117 + if (buffer_uptodate(bh[i])) {
1118 + /*
1119 + * if the group is not uninit and bh is
1120 + * uptodate, the bitmap is also uptodate
1121 + */
1122 + set_bitmap_uptodate(bh[i]);
1123 + unlock_buffer(bh[i]);
1124 + continue;
1125 + }
1126 get_bh(bh[i]);
1127 + /*
1128 + * submit the buffer_head for read. We can
1129 + * safely mark the bitmap as uptodate now.
1130 + * We do it here so the bitmap uptodate bit
1131 + * gets set with the buffer lock held.
1132 + */
1133 + set_bitmap_uptodate(bh[i]);
1134 bh[i]->b_end_io = end_buffer_read_sync;
1135 submit_bh(READ, bh[i]);
1136 mb_debug("read bitmap for group %lu\n", first_group + i);
1137 @@ -814,6 +848,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1138
1139 err = 0;
1140 first_block = page->index * blocks_per_page;
1141 + /* init the page */
1142 + memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
1143 for (i = 0; i < blocks_per_page; i++) {
1144 int group;
1145 struct ext4_group_info *grinfo;
1146 @@ -840,7 +876,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1147 BUG_ON(incore == NULL);
1148 mb_debug("put buddy for group %u in page %lu/%x\n",
1149 group, page->index, i * blocksize);
1150 - memset(data, 0xff, blocksize);
1151 grinfo = ext4_get_group_info(sb, group);
1152 grinfo->bb_fragments = 0;
1153 memset(grinfo->bb_counters, 0,
1154 @@ -848,7 +883,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1155 /*
1156 * incore got set to the group block bitmap below
1157 */
1158 + ext4_lock_group(sb, group);
1159 ext4_mb_generate_buddy(sb, data, incore, group);
1160 + ext4_unlock_group(sb, group);
1161 incore = NULL;
1162 } else {
1163 /* this is block of bitmap */
1164 @@ -862,6 +899,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
1165
1166 /* mark all preallocated blks used in in-core bitmap */
1167 ext4_mb_generate_from_pa(sb, data, group);
1168 + ext4_mb_generate_from_freelist(sb, data, group);
1169 ext4_unlock_group(sb, group);
1170
1171 /* set incore so that the buddy information can be
1172 @@ -886,18 +924,20 @@ static noinline_for_stack int
1173 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1174 struct ext4_buddy *e4b)
1175 {
1176 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1177 - struct inode *inode = sbi->s_buddy_cache;
1178 int blocks_per_page;
1179 int block;
1180 int pnum;
1181 int poff;
1182 struct page *page;
1183 int ret;
1184 + struct ext4_group_info *grp;
1185 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1186 + struct inode *inode = sbi->s_buddy_cache;
1187
1188 mb_debug("load group %lu\n", group);
1189
1190 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1191 + grp = ext4_get_group_info(sb, group);
1192
1193 e4b->bd_blkbits = sb->s_blocksize_bits;
1194 e4b->bd_info = ext4_get_group_info(sb, group);
1195 @@ -905,6 +945,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1196 e4b->bd_group = group;
1197 e4b->bd_buddy_page = NULL;
1198 e4b->bd_bitmap_page = NULL;
1199 + e4b->alloc_semp = &grp->alloc_sem;
1200 +
1201 + /* Take the read lock on the group alloc
1202 + * sem. This makes sure a parallel
1203 + * ext4_mb_init_group happening on other
1204 + * groups mapped by the page is blocked
1205 + * till we are done with allocation
1206 + */
1207 + down_read(e4b->alloc_semp);
1208
1209 /*
1210 * the buddy cache inode stores the block bitmap
1211 @@ -920,6 +969,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1212 page = find_get_page(inode->i_mapping, pnum);
1213 if (page == NULL || !PageUptodate(page)) {
1214 if (page)
1215 + /*
1216 + * drop the page reference and try
1217 + * to get the page with lock. If we
1218 + * are not uptodate that implies
1219 + * somebody just created the page but
1220 + * is yet to initialize the same. So
1221 + * wait for it to initialize.
1222 + */
1223 page_cache_release(page);
1224 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1225 if (page) {
1226 @@ -985,6 +1042,9 @@ err:
1227 page_cache_release(e4b->bd_buddy_page);
1228 e4b->bd_buddy = NULL;
1229 e4b->bd_bitmap = NULL;
1230 +
1231 + /* Done with the buddy cache */
1232 + up_read(e4b->alloc_semp);
1233 return ret;
1234 }
1235
1236 @@ -994,6 +1054,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b)
1237 page_cache_release(e4b->bd_bitmap_page);
1238 if (e4b->bd_buddy_page)
1239 page_cache_release(e4b->bd_buddy_page);
1240 + /* Done with the buddy cache */
1241 + if (e4b->alloc_semp)
1242 + up_read(e4b->alloc_semp);
1243 }
1244
1245
1246 @@ -1031,7 +1094,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
1247 cur += 32;
1248 continue;
1249 }
1250 - mb_clear_bit_atomic(lock, cur, bm);
1251 + if (lock)
1252 + mb_clear_bit_atomic(lock, cur, bm);
1253 + else
1254 + mb_clear_bit(cur, bm);
1255 cur++;
1256 }
1257 }
1258 @@ -1049,7 +1115,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
1259 cur += 32;
1260 continue;
1261 }
1262 - mb_set_bit_atomic(lock, cur, bm);
1263 + if (lock)
1264 + mb_set_bit_atomic(lock, cur, bm);
1265 + else
1266 + mb_set_bit(cur, bm);
1267 cur++;
1268 }
1269 }
1270 @@ -1296,13 +1365,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1271 ac->ac_tail = ret & 0xffff;
1272 ac->ac_buddy = ret >> 16;
1273
1274 - /* XXXXXXX: SUCH A HORRIBLE **CK */
1275 - /*FIXME!! Why ? */
1276 + /*
1277 + * take the page reference. We want the page to be pinned
1278 + * so that we don't get an ext4_mb_init_cache call for this
1279 + * group until we update the bitmap. Otherwise we could
1280 + * double allocate blocks. The reference is dropped
1281 + * in ext4_mb_release_context
1282 + */
1283 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1284 get_page(ac->ac_bitmap_page);
1285 ac->ac_buddy_page = e4b->bd_buddy_page;
1286 get_page(ac->ac_buddy_page);
1287 -
1288 + /* on allocation we use ac to track the held semaphore */
1289 + ac->alloc_semp = e4b->alloc_semp;
1290 + e4b->alloc_semp = NULL;
1291 /* store last allocated for subsequent stream allocation */
1292 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
1293 spin_lock(&sbi->s_md_lock);
1294 @@ -1326,6 +1402,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1295 struct ext4_free_extent ex;
1296 int max;
1297
1298 + if (ac->ac_status == AC_STATUS_FOUND)
1299 + return;
1300 /*
1301 * We don't want to scan for a whole year
1302 */
1303 @@ -1692,6 +1770,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1304 return 0;
1305 }
1306
1307 +/*
1308 + * lock the group_info alloc_sem of all the groups
1309 + * belonging to the same buddy cache page. This
1310 + * makes sure other parallel operations on the buddy
1311 + * cache don't happen while holding the buddy cache
1312 + * lock
1313 + */
1314 +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
1315 +{
1316 + int i;
1317 + int block, pnum;
1318 + int blocks_per_page;
1319 + int groups_per_page;
1320 + ext4_group_t first_group;
1321 + struct ext4_group_info *grp;
1322 +
1323 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1324 + /*
1325 + * the buddy cache inode stores the block bitmap
1326 + * and buddy information in consecutive blocks.
1327 + * So for each group we need two blocks.
1328 + */
1329 + block = group * 2;
1330 + pnum = block / blocks_per_page;
1331 + first_group = pnum * blocks_per_page / 2;
1332 +
1333 + groups_per_page = blocks_per_page >> 1;
1334 + if (groups_per_page == 0)
1335 + groups_per_page = 1;
1336 + /* read all groups the page covers into the cache */
1337 + for (i = 0; i < groups_per_page; i++) {
1338 +
1339 + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
1340 + break;
1341 + grp = ext4_get_group_info(sb, first_group + i);
1342 + /* take each group's write allocation
1343 + * semaphore. This makes sure there is
1344 + * no block allocation going on in any
1345 + * of those groups
1346 + */
1347 + down_write(&grp->alloc_sem);
1348 + }
1349 + return i;
1350 +}
1351 +
1352 +void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1353 + ext4_group_t group, int locked_group)
1354 +{
1355 + int i;
1356 + int block, pnum;
1357 + int blocks_per_page;
1358 + ext4_group_t first_group;
1359 + struct ext4_group_info *grp;
1360 +
1361 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1362 + /*
1363 + * the buddy cache inode stores the block bitmap
1364 + * and buddy information in consecutive blocks.
1365 + * So for each group we need two blocks.
1366 + */
1367 + block = group * 2;
1368 + pnum = block / blocks_per_page;
1369 + first_group = pnum * blocks_per_page / 2;
1370 + /* release locks on all the groups */
1371 + for (i = 0; i < locked_group; i++) {
1372 +
1373 + grp = ext4_get_group_info(sb, first_group + i);
1374 + /* release each group's write allocation
1375 + * semaphore taken in
1376 + * ext4_mb_get_buddy_cache_lock, so block
1377 + * allocation in those groups can resume
1378 + */
1379 + up_write(&grp->alloc_sem);
1380 + }
1381 +
1382 +}
1383 +
1384 +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1385 +{
1386 +
1387 + int ret;
1388 + void *bitmap;
1389 + int blocks_per_page;
1390 + int block, pnum, poff;
1391 + int num_grp_locked = 0;
1392 + struct ext4_group_info *this_grp;
1393 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1394 + struct inode *inode = sbi->s_buddy_cache;
1395 + struct page *page = NULL, *bitmap_page = NULL;
1396 +
1397 + mb_debug("init group %lu\n", group);
1398 + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1399 + this_grp = ext4_get_group_info(sb, group);
1400 + /*
1401 + * This ensures we don't add group
1402 + * to this buddy cache via resize
1403 + */
1404 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1405 + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1406 + /*
1407 + * somebody initialized the group
1408 + * return without doing anything
1409 + */
1410 + ret = 0;
1411 + goto err;
1412 + }
1413 + /*
1414 + * the buddy cache inode stores the block bitmap
1415 + * and buddy information in consecutive blocks.
1416 + * So for each group we need two blocks.
1417 + */
1418 + block = group * 2;
1419 + pnum = block / blocks_per_page;
1420 + poff = block % blocks_per_page;
1421 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1422 + if (page) {
1423 + BUG_ON(page->mapping != inode->i_mapping);
1424 + ret = ext4_mb_init_cache(page, NULL);
1425 + if (ret) {
1426 + unlock_page(page);
1427 + goto err;
1428 + }
1429 + unlock_page(page);
1430 + }
1431 + if (page == NULL || !PageUptodate(page)) {
1432 + ret = -EIO;
1433 + goto err;
1434 + }
1435 + mark_page_accessed(page);
1436 + bitmap_page = page;
1437 + bitmap = page_address(page) + (poff * sb->s_blocksize);
1438 +
1439 + /* init buddy cache */
1440 + block++;
1441 + pnum = block / blocks_per_page;
1442 + poff = block % blocks_per_page;
1443 + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1444 + if (page == bitmap_page) {
1445 + /*
1446 + * If both the bitmap and buddy are in
1447 + * the same page we don't need to force
1448 + * init the buddy
1449 + */
1450 + unlock_page(page);
1451 + } else if (page) {
1452 + BUG_ON(page->mapping != inode->i_mapping);
1453 + ret = ext4_mb_init_cache(page, bitmap);
1454 + if (ret) {
1455 + unlock_page(page);
1456 + goto err;
1457 + }
1458 + unlock_page(page);
1459 + }
1460 + if (page == NULL || !PageUptodate(page)) {
1461 + ret = -EIO;
1462 + goto err;
1463 + }
1464 + mark_page_accessed(page);
1465 +err:
1466 + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1467 + if (bitmap_page)
1468 + page_cache_release(bitmap_page);
1469 + if (page)
1470 + page_cache_release(page);
1471 + return ret;
1472 +}
1473 +
1474 static noinline_for_stack int
1475 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1476 {
1477 @@ -1775,7 +2020,7 @@ repeat:
1478 group = 0;
1479
1480 /* quick check to skip empty groups */
1481 - grp = ext4_get_group_info(ac->ac_sb, group);
1482 + grp = ext4_get_group_info(sb, group);
1483 if (grp->bb_free == 0)
1484 continue;
1485
1486 @@ -1788,10 +2033,9 @@ repeat:
1487 * we need full data about the group
1488 * to make a good selection
1489 */
1490 - err = ext4_mb_load_buddy(sb, group, &e4b);
1491 + err = ext4_mb_init_group(sb, group);
1492 if (err)
1493 goto out;
1494 - ext4_mb_release_desc(&e4b);
1495 }
1496
1497 /*
1498 @@ -2299,6 +2543,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
1499 }
1500
1501 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
1502 + init_rwsem(&meta_group_info[i]->alloc_sem);
1503 + meta_group_info[i]->bb_free_root.rb_node = NULL;
1504
1505 #ifdef DOUBLE_CHECK
1506 {
1507 @@ -2325,54 +2571,6 @@ exit_meta_group_info:
1508 } /* ext4_mb_add_groupinfo */
1509
1510 /*
1511 - * Add a group to the existing groups.
1512 - * This function is used for online resize
1513 - */
1514 -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
1515 - struct ext4_group_desc *desc)
1516 -{
1517 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1518 - struct inode *inode = sbi->s_buddy_cache;
1519 - int blocks_per_page;
1520 - int block;
1521 - int pnum;
1522 - struct page *page;
1523 - int err;
1524 -
1525 - /* Add group based on group descriptor*/
1526 - err = ext4_mb_add_groupinfo(sb, group, desc);
1527 - if (err)
1528 - return err;
1529 -
1530 - /*
1531 - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
1532 - * datas) are set not up to date so that they will be re-initilaized
1533 - * during the next call to ext4_mb_load_buddy
1534 - */
1535 -
1536 - /* Set buddy page as not up to date */
1537 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1538 - block = group * 2;
1539 - pnum = block / blocks_per_page;
1540 - page = find_get_page(inode->i_mapping, pnum);
1541 - if (page != NULL) {
1542 - ClearPageUptodate(page);
1543 - page_cache_release(page);
1544 - }
1545 -
1546 - /* Set bitmap page as not up to date */
1547 - block++;
1548 - pnum = block / blocks_per_page;
1549 - page = find_get_page(inode->i_mapping, pnum);
1550 - if (page != NULL) {
1551 - ClearPageUptodate(page);
1552 - page_cache_release(page);
1553 - }
1554 -
1555 - return 0;
1556 -}
1557 -
1558 -/*
1559 * Update an existing group.
1560 * This function is used for online resize
1561 */
1562 @@ -2495,6 +2693,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
1563 clear_opt(sbi->s_mount_opt, MBALLOC);
1564 return -ENOMEM;
1565 }
1566 +
1567 + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
1568 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
1569 if (sbi->s_mb_maxs == NULL) {
1570 clear_opt(sbi->s_mount_opt, MBALLOC);
1571 @@ -2658,13 +2858,11 @@ int ext4_mb_release(struct super_block *sb)
1572 static noinline_for_stack void
1573 ext4_mb_free_committed_blocks(struct super_block *sb)
1574 {
1575 - struct ext4_sb_info *sbi = EXT4_SB(sb);
1576 - int err;
1577 - int i;
1578 - int count = 0;
1579 - int count2 = 0;
1580 - struct ext4_free_metadata *md;
1581 struct ext4_buddy e4b;
1582 + struct ext4_group_info *db;
1583 + struct ext4_sb_info *sbi = EXT4_SB(sb);
1584 + int err, count = 0, count2 = 0;
1585 + struct ext4_free_data *entry;
1586
1587 if (list_empty(&sbi->s_committed_transaction))
1588 return;
1589 @@ -2672,44 +2870,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
1590 /* there is committed blocks to be freed yet */
1591 do {
1592 /* get next array of blocks */
1593 - md = NULL;
1594 + entry = NULL;
1595 spin_lock(&sbi->s_md_lock);
1596 if (!list_empty(&sbi->s_committed_transaction)) {
1597 - md = list_entry(sbi->s_committed_transaction.next,
1598 - struct ext4_free_metadata, list);
1599 - list_del(&md->list);
1600 + entry = list_entry(sbi->s_committed_transaction.next,
1601 + struct ext4_free_data, list);
1602 + list_del(&entry->list);
1603 }
1604 spin_unlock(&sbi->s_md_lock);
1605
1606 - if (md == NULL)
1607 + if (entry == NULL)
1608 break;
1609
1610 mb_debug("gonna free %u blocks in group %lu (0x%p):",
1611 - md->num, md->group, md);
1612 + entry->count, entry->group, entry);
1613
1614 - err = ext4_mb_load_buddy(sb, md->group, &e4b);
1615 + err = ext4_mb_load_buddy(sb, entry->group, &e4b);
1616 /* we expect to find existing buddy because it's pinned */
1617 BUG_ON(err != 0);
1618
1619 + db = e4b.bd_info;
1620 /* there are blocks to put in buddy to make them really free */
1621 - count += md->num;
1622 + count += entry->count;
1623 count2++;
1624 - ext4_lock_group(sb, md->group);
1625 - for (i = 0; i < md->num; i++) {
1626 - mb_debug(" %u", md->blocks[i]);
1627 - mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
1628 + ext4_lock_group(sb, entry->group);
1629 + /* Take it out of per group rb tree */
1630 + rb_erase(&entry->node, &(db->bb_free_root));
1631 + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
1632 +
1633 + if (!db->bb_free_root.rb_node) {
1634 + /* No more items in the per group rb tree
1635 + * balance refcounts from ext4_mb_free_metadata()
1636 + */
1637 + page_cache_release(e4b.bd_buddy_page);
1638 + page_cache_release(e4b.bd_bitmap_page);
1639 }
1640 - mb_debug("\n");
1641 - ext4_unlock_group(sb, md->group);
1642 -
1643 - /* balance refcounts from ext4_mb_free_metadata() */
1644 - page_cache_release(e4b.bd_buddy_page);
1645 - page_cache_release(e4b.bd_bitmap_page);
1646 + ext4_unlock_group(sb, entry->group);
1647
1648 - kfree(md);
1649 + kmem_cache_free(ext4_free_ext_cachep, entry);
1650 ext4_mb_release_desc(&e4b);
1651 -
1652 - } while (md);
1653 + } while (1);
1654
1655 mb_debug("freed %u blocks in %u structures\n", count, count2);
1656 }
1657 @@ -2864,6 +3064,16 @@ int __init init_ext4_mballoc(void)
1658 kmem_cache_destroy(ext4_pspace_cachep);
1659 return -ENOMEM;
1660 }
1661 +
1662 + ext4_free_ext_cachep =
1663 + kmem_cache_create("ext4_free_block_extents",
1664 + sizeof(struct ext4_free_data),
1665 + 0, SLAB_RECLAIM_ACCOUNT, NULL);
1666 + if (ext4_free_ext_cachep == NULL) {
1667 + kmem_cache_destroy(ext4_pspace_cachep);
1668 + kmem_cache_destroy(ext4_ac_cachep);
1669 + return -ENOMEM;
1670 + }
1671 #ifdef CONFIG_PROC_FS
1672 proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
1673 if (proc_root_ext4 == NULL)
1674 @@ -2880,6 +3090,7 @@ void exit_ext4_mballoc(void)
1675 #ifdef CONFIG_PROC_FS
1676 remove_proc_entry("fs/ext4", NULL);
1677 #endif
1678 + kmem_cache_destroy(ext4_free_ext_cachep);
1679 }
1680
1681
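
The error path above destroys, in reverse order, every cache created before the one that failed. The same unwind shape in a standalone sketch (malloc stand-ins, not kmem caches):

#include <stdlib.h>

static void *pspace, *ac, *free_ext;   /* stand-ins for the three caches */

static int init_caches(void)
{
    pspace = malloc(64);
    if (!pspace)
        return -1;
    ac = malloc(64);
    if (!ac)
        goto fail_pspace;
    free_ext = malloc(64);
    if (!free_ext)
        goto fail_ac;
    return 0;

fail_ac:                                /* unwind in reverse order */
    free(ac);
fail_pspace:
    free(pspace);
    return -1;
}

int main(void)
{
    return init_caches() ? 1 : 0;
}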
1682 @@ -2941,8 +3152,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1683 in_range(block + len - 1, ext4_inode_table(sb, gdp),
1684 EXT4_SB(sb)->s_itb_per_group)) {
1685 ext4_error(sb, __func__,
1686 - "Allocating block in system zone - block = %llu",
1687 - block);
1688 + "Allocating block %llu in system zone of %lu group\n",
1689 + block, ac->ac_b_ex.fe_group);
1690 /* File system mounted not to panic on error
1691 * Fix the bitmap and repeat the block allocation
1692 * We leak some of the blocks here.
1693 @@ -2964,10 +3175,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
1694 }
1695 }
1696 #endif
1697 - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data,
1698 - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1699 -
1700 spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
1701 + mb_set_bits(NULL, bitmap_bh->b_data,
1702 + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
1703 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
1704 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
1705 gdp->bg_free_blocks_count =
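
Note the NULL first argument: in mballoc, handing mb_set_bits() a NULL lock means the caller already holds the group lock, so the helper skips its own locking, which is why the call could move inside sb_bgl_lock(). The convention in miniature (portable sketch, illustrative names):

#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

/* lock == NULL means "caller already holds the group lock" */
static void set_bits(pthread_mutex_t *lock, unsigned char *map,
                     int first, int n)
{
    if (lock)
        pthread_mutex_lock(lock);
    while (n--) {
        map[first >> 3] |= 1u << (first & 7);
        first++;
    }
    if (lock)
        pthread_mutex_unlock(lock);
}

int main(void)
{
    unsigned char bitmap[8] = { 0 };

    pthread_mutex_lock(&group_lock);      /* as the hunk now does */
    set_bits(NULL, bitmap, 3, 5);         /* no double-locking inside */
    pthread_mutex_unlock(&group_lock);
    return 0;
}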
1706 @@ -3400,10 +3610,37 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
1707 ac->ac_criteria = 20;
1708 return 1;
1709 }
1710 +
1711 return 0;
1712 }
1713
1714 /*
1715 + * the function goes through all blocks freed in the group
1716 + * but not yet committed and marks them used in the in-core bitmap.
1717 + * the buddy must be generated from this bitmap
1718 + * Must be called with the ext4 group lock held (ext4_lock_group)
1719 + */
1720 +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
1721 + ext4_group_t group)
1722 +{
1723 + struct rb_node *n;
1724 + struct ext4_group_info *grp;
1725 + struct ext4_free_data *entry;
1726 +
1727 + grp = ext4_get_group_info(sb, group);
1728 + n = rb_first(&(grp->bb_free_root));
1729 +
1730 + while (n) {
1731 + entry = rb_entry(n, struct ext4_free_data, node);
1732 + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
1733 + bitmap, entry->start_blk,
1734 + entry->count);
1735 + n = rb_next(n);
1736 + }
1737 + return;
1738 +}
1739 +
1740 +/*
1741 * the function goes through all preallocation in this group and marks them
1742 * used in in-core bitmap. buddy must be generated from this bitmap
1743 * Need to be called with ext4 group lock (ext4_lock_group)
1744 @@ -4166,6 +4403,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
1745 ac->ac_pa = NULL;
1746 ac->ac_bitmap_page = NULL;
1747 ac->ac_buddy_page = NULL;
1748 + ac->alloc_semp = NULL;
1749 ac->ac_lg = NULL;
1750
1751 /* we have to define context: we'll we work with a file or
1752 @@ -4346,6 +4584,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
1753 }
1754 ext4_mb_put_pa(ac, ac->ac_sb, pa);
1755 }
1756 + if (ac->alloc_semp)
1757 + up_read(ac->alloc_semp);
1758 if (ac->ac_bitmap_page)
1759 page_cache_release(ac->ac_bitmap_page);
1760 if (ac->ac_buddy_page)
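
ac->alloc_semp records a read-held rw-semaphore (the per-group alloc_sem added in the mballoc.h hunk below) so that context teardown can drop it exactly once. A portable analogue of "remember the lock you hold" using a pthread rwlock (hypothetical names):

#include <pthread.h>
#include <stddef.h>

struct alloc_ctx { pthread_rwlock_t *alloc_semp; };

static pthread_rwlock_t group_sem = PTHREAD_RWLOCK_INITIALIZER;

static void use_best_found(struct alloc_ctx *ac)
{
    pthread_rwlock_rdlock(&group_sem);  /* exclude writers on this group */
    ac->alloc_semp = &group_sem;        /* remember what we hold */
}

static void release_context(struct alloc_ctx *ac)
{
    if (ac->alloc_semp) {               /* mirrors `if (ac->alloc_semp) up_read(...)` */
        pthread_rwlock_unlock(ac->alloc_semp);
        ac->alloc_semp = NULL;
    }
}

int main(void)
{
    struct alloc_ctx ac = { NULL };

    use_best_found(&ac);
    release_context(&ac);
    return 0;
}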
1761 @@ -4449,10 +4689,14 @@ repeat:
1762 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
1763 ext4_mb_new_preallocation(ac);
1764 }
1765 -
1766 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
1767 *errp = ext4_mb_mark_diskspace_used(ac, handle);
1768 if (*errp == -EAGAIN) {
1769 + /*
1770 + * drop the reference that we took
1771 + * in ext4_mb_use_best_found
1772 + */
1773 + ext4_mb_release_context(ac);
1774 ac->ac_b_ex.fe_group = 0;
1775 ac->ac_b_ex.fe_start = 0;
1776 ac->ac_b_ex.fe_len = 0;
1777 @@ -4517,65 +4761,97 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb,
1778 ext4_mb_free_committed_blocks(sb);
1779 }
1780
1781 +/*
1782 + * We can merge two free data extents only if the physical blocks
1783 + * are contiguous, AND the extents were freed by the same transaction,
1784 + * AND the blocks are associated with the same group.
1785 + */
1786 +static int can_merge(struct ext4_free_data *entry1,
1787 + struct ext4_free_data *entry2)
1788 +{
1789 + if ((entry1->t_tid == entry2->t_tid) &&
1790 + (entry1->group == entry2->group) &&
1791 + ((entry1->start_blk + entry1->count) == entry2->start_blk))
1792 + return 1;
1793 + return 0;
1794 +}
1795 +
1796 static noinline_for_stack int
1797 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
1798 - ext4_group_t group, ext4_grpblk_t block, int count)
1799 + struct ext4_free_data *new_entry)
1800 {
1801 + ext4_grpblk_t block;
1802 + struct ext4_free_data *entry;
1803 struct ext4_group_info *db = e4b->bd_info;
1804 struct super_block *sb = e4b->bd_sb;
1805 struct ext4_sb_info *sbi = EXT4_SB(sb);
1806 - struct ext4_free_metadata *md;
1807 - int i;
1808 + struct rb_node **n = &db->bb_free_root.rb_node, *node;
1809 + struct rb_node *parent = NULL, *new_node;
1810
1811 BUG_ON(e4b->bd_bitmap_page == NULL);
1812 BUG_ON(e4b->bd_buddy_page == NULL);
1813
1814 - ext4_lock_group(sb, group);
1815 - for (i = 0; i < count; i++) {
1816 - md = db->bb_md_cur;
1817 - if (md && db->bb_tid != handle->h_transaction->t_tid) {
1818 - db->bb_md_cur = NULL;
1819 - md = NULL;
1820 + new_node = &new_entry->node;
1821 + block = new_entry->start_blk;
1822 +
1823 + if (!*n) {
1824 + /* first free block extent. We need to
1825 + * protect buddy cache from being freed,
1826 + * otherwise we'll refresh it from
1827 + * on-disk bitmap and lose not-yet-available
1828 + * blocks */
1829 + page_cache_get(e4b->bd_buddy_page);
1830 + page_cache_get(e4b->bd_bitmap_page);
1831 + }
1832 + while (*n) {
1833 + parent = *n;
1834 + entry = rb_entry(parent, struct ext4_free_data, node);
1835 + if (block < entry->start_blk)
1836 + n = &(*n)->rb_left;
1837 + else if (block >= (entry->start_blk + entry->count))
1838 + n = &(*n)->rb_right;
1839 + else {
1840 + ext4_error(sb, __func__,
1841 + "Double free of blocks %d (%d %d)\n",
1842 + block, entry->start_blk, entry->count);
1843 + return 0;
1844 }
1845 + }
1846
1847 - if (md == NULL) {
1848 - ext4_unlock_group(sb, group);
1849 - md = kmalloc(sizeof(*md), GFP_NOFS);
1850 - if (md == NULL)
1851 - return -ENOMEM;
1852 - md->num = 0;
1853 - md->group = group;
1854 -
1855 - ext4_lock_group(sb, group);
1856 - if (db->bb_md_cur == NULL) {
1857 - spin_lock(&sbi->s_md_lock);
1858 - list_add(&md->list, &sbi->s_active_transaction);
1859 - spin_unlock(&sbi->s_md_lock);
1860 - /* protect buddy cache from being freed,
1861 - * otherwise we'll refresh it from
1862 - * on-disk bitmap and lose not-yet-available
1863 - * blocks */
1864 - page_cache_get(e4b->bd_buddy_page);
1865 - page_cache_get(e4b->bd_bitmap_page);
1866 - db->bb_md_cur = md;
1867 - db->bb_tid = handle->h_transaction->t_tid;
1868 - mb_debug("new md 0x%p for group %lu\n",
1869 - md, md->group);
1870 - } else {
1871 - kfree(md);
1872 - md = db->bb_md_cur;
1873 - }
1874 + rb_link_node(new_node, parent, n);
1875 + rb_insert_color(new_node, &db->bb_free_root);
1876 +
1877 + /* Now see whether the extent can be merged with its left and right neighbors */
1878 + node = rb_prev(new_node);
1879 + if (node) {
1880 + entry = rb_entry(node, struct ext4_free_data, node);
1881 + if (can_merge(entry, new_entry)) {
1882 + new_entry->start_blk = entry->start_blk;
1883 + new_entry->count += entry->count;
1884 + rb_erase(node, &(db->bb_free_root));
1885 + spin_lock(&sbi->s_md_lock);
1886 + list_del(&entry->list);
1887 + spin_unlock(&sbi->s_md_lock);
1888 + kmem_cache_free(ext4_free_ext_cachep, entry);
1889 }
1890 + }
1891
1892 - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
1893 - md->blocks[md->num] = block + i;
1894 - md->num++;
1895 - if (md->num == EXT4_BB_MAX_BLOCKS) {
1896 - /* no more space, put full container on a sb's list */
1897 - db->bb_md_cur = NULL;
1898 + node = rb_next(new_node);
1899 + if (node) {
1900 + entry = rb_entry(node, struct ext4_free_data, node);
1901 + if (can_merge(new_entry, entry)) {
1902 + new_entry->count += entry->count;
1903 + rb_erase(node, &(db->bb_free_root));
1904 + spin_lock(&sbi->s_md_lock);
1905 + list_del(&entry->list);
1906 + spin_unlock(&sbi->s_md_lock);
1907 + kmem_cache_free(ext4_free_ext_cachep, entry);
1908 }
1909 }
1910 - ext4_unlock_group(sb, group);
1911 + /* Add the extent to the active_transaction list */
1912 + spin_lock(&sbi->s_md_lock);
1913 + list_add(&new_entry->list, &sbi->s_active_transaction);
1914 + spin_unlock(&sbi->s_md_lock);
1915 return 0;
1916 }
1917
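
The insertion above is the canonical linux/rbtree.h idiom: walk down from the root remembering the link pointer to patch, then splice the node in as a leaf and let the core rebalance. A generic in-kernel-style sketch (hypothetical struct item; compiles against kernel headers only, not standalone):

#include <linux/rbtree.h>

struct item {                           /* hypothetical payload */
    struct rb_node node;
    unsigned long key;
};

static int item_insert(struct rb_root *root, struct item *new)
{
    struct rb_node **p = &root->rb_node, *parent = NULL;

    while (*p) {                        /* walk down, remembering the link */
        struct item *cur = rb_entry(*p, struct item, node);

        parent = *p;
        if (new->key < cur->key)
            p = &(*p)->rb_left;
        else if (new->key > cur->key)
            p = &(*p)->rb_right;
        else
            return -1;                  /* duplicate, like the double-free case */
    }
    rb_link_node(&new->node, parent, p);    /* splice in as a leaf */
    rb_insert_color(&new->node, root);      /* rebalance and recolor */
    return 0;
}

The hunk's extra step is probing rb_prev()/rb_next() after insertion and coalescing neighbors that can_merge() accepts, so the tree keeps one node per contiguous pending extent.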
1918 @@ -4675,11 +4951,6 @@ do_more:
1919 err = ext4_journal_get_write_access(handle, gd_bh);
1920 if (err)
1921 goto error_return;
1922 -
1923 - err = ext4_mb_load_buddy(sb, block_group, &e4b);
1924 - if (err)
1925 - goto error_return;
1926 -
1927 #ifdef AGGRESSIVE_CHECK
1928 {
1929 int i;
1930 @@ -4687,13 +4958,6 @@ do_more:
1931 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
1932 }
1933 #endif
1934 - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1935 - bit, count);
1936 -
1937 - /* We dirtied the bitmap block */
1938 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1939 - err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1940 -
1941 if (ac) {
1942 ac->ac_b_ex.fe_group = block_group;
1943 ac->ac_b_ex.fe_start = bit;
1944 @@ -4701,12 +4965,33 @@ do_more:
1945 ext4_mb_store_history(ac);
1946 }
1947
1948 + err = ext4_mb_load_buddy(sb, block_group, &e4b);
1949 + if (err)
1950 + goto error_return;
1951 if (metadata) {
1952 - /* blocks being freed are metadata. these blocks shouldn't
1953 - * be used until this transaction is committed */
1954 - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
1955 + struct ext4_free_data *new_entry;
1956 + /*
1957 + * blocks being freed are metadata. these blocks shouldn't
1958 + * be used until this transaction is committed
1959 + */
1960 + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
1961 + new_entry->start_blk = bit;
1962 + new_entry->group = block_group;
1963 + new_entry->count = count;
1964 + new_entry->t_tid = handle->h_transaction->t_tid;
1965 + ext4_lock_group(sb, block_group);
1966 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1967 + bit, count);
1968 + ext4_mb_free_metadata(handle, &e4b, new_entry);
1969 + ext4_unlock_group(sb, block_group);
1970 } else {
1971 ext4_lock_group(sb, block_group);
1972 + /* need to update group_info->bb_free and bitmap
1973 + * with the group lock held. generate_buddy looks at
1974 + * them with the group lock held
1975 + */
1976 + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
1977 + bit, count);
1978 mb_free_blocks(inode, &e4b, bit, count);
1979 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
1980 ext4_unlock_group(sb, block_group);
1981 @@ -4729,6 +5014,10 @@ do_more:
1982
1983 *freed += count;
1984
1985 + /* We dirtied the bitmap block */
1986 + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
1987 + err = ext4_journal_dirty_metadata(handle, bitmap_bh);
1988 +
1989 /* And the group descriptor block */
1990 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
1991 ret = ext4_journal_dirty_metadata(handle, gd_bh);
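
Taking the reordered hunks of ext4_free_blocks() together, the intended shape is: lock the group, update bitmap and buddy consistently, defer metadata extents to the rb-tree, unlock, then dirty the journal buffers. A stub-level sketch of that ordering (stand-in helpers, not the kernel API):

#include <stdio.h>

/* stubs standing in for the kernel primitives */
static void lock_group(int g)   { (void)g; }
static void unlock_group(int g) { (void)g; }
static void clear_bits(int bit, int n)     { printf("clear %d+%d\n", bit, n); }
static void record_pending(int bit, int n) { printf("defer %d+%d\n", bit, n); }
static void free_buddy(int bit, int n)     { printf("free  %d+%d\n", bit, n); }

static void free_blocks(int group, int bit, int n, int metadata)
{
    lock_group(group);
    clear_bits(bit, n);             /* bitmap and buddy change together */
    if (metadata)
        record_pending(bit, n);     /* reusable only after commit */
    else
        free_buddy(bit, n);         /* data blocks reusable at once */
    unlock_group(group);
    /* journal buffers are dirtied only after the group lock drops */
}

int main(void)
{
    free_blocks(7, 100, 8, 1);
    free_blocks(7, 200, 4, 0);
    return 0;
}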
1992 diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
1993 index c7c9906..0a28dd3 100644
1994 --- a/fs/ext4/mballoc.h
1995 +++ b/fs/ext4/mballoc.h
1996 @@ -18,6 +18,7 @@
1997 #include <linux/pagemap.h>
1998 #include <linux/seq_file.h>
1999 #include <linux/version.h>
2000 +#include <linux/mutex.h>
2001 #include "ext4_jbd2.h"
2002 #include "ext4.h"
2003 #include "group.h"
2004 @@ -96,25 +97,27 @@
2005 */
2006 #define MB_DEFAULT_GROUP_PREALLOC 512
2007
2008 -static struct kmem_cache *ext4_pspace_cachep;
2009 -static struct kmem_cache *ext4_ac_cachep;
2010 +struct ext4_free_data {
2011 + /* this links the free block information from group_info */
2012 + struct rb_node node;
2013
2014 -#ifdef EXT4_BB_MAX_BLOCKS
2015 -#undef EXT4_BB_MAX_BLOCKS
2016 -#endif
2017 -#define EXT4_BB_MAX_BLOCKS 30
2018 + /* this links the free block information from ext4_sb_info */
2019 + struct list_head list;
2020
2021 -struct ext4_free_metadata {
2022 + /* group to which the free block extent belongs */
2023 ext4_group_t group;
2024 - unsigned short num;
2025 - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
2026 - struct list_head list;
2027 +
2028 + /* free block extent */
2029 + ext4_grpblk_t start_blk;
2030 + ext4_grpblk_t count;
2031 +
2032 + /* transaction which freed this extent */
2033 + tid_t t_tid;
2034 };
2035
2036 struct ext4_group_info {
2037 unsigned long bb_state;
2038 - unsigned long bb_tid;
2039 - struct ext4_free_metadata *bb_md_cur;
2040 + struct rb_root bb_free_root;
2041 unsigned short bb_first_free;
2042 unsigned short bb_free;
2043 unsigned short bb_fragments;
2044 @@ -122,6 +125,7 @@ struct ext4_group_info {
2045 #ifdef DOUBLE_CHECK
2046 void *bb_bitmap;
2047 #endif
2048 + struct rw_semaphore alloc_sem;
2049 unsigned short bb_counters[];
2050 };
2051
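
Each ext4_free_data is intrusively linked into two containers at once: a per-group rb-tree ordered by start block (for lookup and merging) and the per-superblock transaction list (for commit-time draining). The pattern in miniature (kernel-style fragment, illustrative type):

#include <linux/rbtree.h>
#include <linux/list.h>

struct twice_indexed {
    struct rb_node   by_key;    /* per-group ordering, for lookup/merge */
    struct list_head by_age;    /* per-sb FIFO, for commit-time draining */
    unsigned long    key;
};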
2052 @@ -209,6 +213,11 @@ struct ext4_allocation_context {
2053 __u8 ac_op; /* operation, for history only */
2054 struct page *ac_bitmap_page;
2055 struct page *ac_buddy_page;
2056 + /*
2057 + * pointer to the held semaphore upon successful
2058 + * block allocation
2059 + */
2060 + struct rw_semaphore *alloc_semp;
2061 struct ext4_prealloc_space *ac_pa;
2062 struct ext4_locality_group *ac_lg;
2063 };
2064 @@ -242,6 +251,7 @@ struct ext4_buddy {
2065 struct super_block *bd_sb;
2066 __u16 bd_blkbits;
2067 ext4_group_t bd_group;
2068 + struct rw_semaphore *alloc_semp;
2069 };
2070 #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap)
2071 #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy)
2072 @@ -251,8 +261,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
2073 {
2074 return;
2075 }
2076 -#else
2077 -static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2078 #endif
2079
2080 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2081 @@ -260,19 +268,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
2082 static struct proc_dir_entry *proc_root_ext4;
2083 struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
2084
2085 -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
2086 - ext4_group_t group);
2087 -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
2088 -static void ext4_mb_free_committed_blocks(struct super_block *);
2089 -static void ext4_mb_return_to_preallocation(struct inode *inode,
2090 - struct ext4_buddy *e4b, sector_t block,
2091 - int count);
2092 -static void ext4_mb_put_pa(struct ext4_allocation_context *,
2093 - struct super_block *, struct ext4_prealloc_space *pa);
2094 -static int ext4_mb_init_per_dev_proc(struct super_block *sb);
2095 -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
2096 -
2097 -
2098 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
2099 {
2100 struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
2101 @@ -297,7 +292,7 @@ static inline int ext4_is_group_locked(struct super_block *sb,
2102 &(grinfo->bb_state));
2103 }
2104
2105 -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2106 +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
2107 struct ext4_free_extent *fex)
2108 {
2109 ext4_fsblk_t block;
2110 diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
2111 index d626533..4f3628f 100644
2112 --- a/fs/ext4/namei.c
2113 +++ b/fs/ext4/namei.c
2114 @@ -371,6 +371,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
2115 goto fail;
2116 }
2117 hinfo->hash_version = root->info.hash_version;
2118 + if (hinfo->hash_version <= DX_HASH_TEA)
2119 + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2120 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2121 if (dentry)
2122 ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
2123 @@ -640,6 +642,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
2124 dir = dir_file->f_path.dentry->d_inode;
2125 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
2126 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
2127 + if (hinfo.hash_version <= DX_HASH_TEA)
2128 + hinfo.hash_version +=
2129 + EXT4_SB(dir->i_sb)->s_hash_unsigned;
2130 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2131 count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
2132 start_hash, start_minor_hash);
2133 @@ -1377,7 +1382,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2134 struct fake_dirent *fde;
2135
2136 blocksize = dir->i_sb->s_blocksize;
2137 - dxtrace(printk("Creating index\n"));
2138 + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
2139 retval = ext4_journal_get_write_access(handle, bh);
2140 if (retval) {
2141 ext4_std_error(dir->i_sb, retval);
2142 @@ -1386,6 +1391,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2143 }
2144 root = (struct dx_root *) bh->b_data;
2145
2146 + /* The 0th block becomes the root, move the dirents out */
2147 + fde = &root->dotdot;
2148 + de = (struct ext4_dir_entry_2 *)((char *)fde +
2149 + ext4_rec_len_from_disk(fde->rec_len));
2150 + if ((char *) de >= (((char *) root) + blocksize)) {
2151 + ext4_error(dir->i_sb, __func__,
2152 + "invalid rec_len for '..' in inode %lu",
2153 + dir->i_ino);
2154 + brelse(bh);
2155 + return -EIO;
2156 + }
2157 + len = ((char *) root) + blocksize - (char *) de;
2158 +
2159 + /* Allocate new block for the 0th block's dirents */
2160 bh2 = ext4_append (handle, dir, &block, &retval);
2161 if (!(bh2)) {
2162 brelse(bh);
2163 @@ -1394,11 +1413,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2164 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
2165 data1 = bh2->b_data;
2166
2167 - /* The 0th block becomes the root, move the dirents out */
2168 - fde = &root->dotdot;
2169 - de = (struct ext4_dir_entry_2 *)((char *)fde +
2170 - ext4_rec_len_from_disk(fde->rec_len));
2171 - len = ((char *) root) + blocksize - (char *) de;
2172 memcpy (data1, de, len);
2173 de = (struct ext4_dir_entry_2 *) data1;
2174 top = data1 + len;
2175 @@ -1418,6 +1432,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
2176
2177 /* Initialize as for dx_probe */
2178 hinfo.hash_version = root->info.hash_version;
2179 + if (hinfo.hash_version <= DX_HASH_TEA)
2180 + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
2181 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2182 ext4fs_dirhash(name, namelen, &hinfo);
2183 frame = frames;
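
Background for the repeated hash_version adjustment: the legacy dirhash read filename bytes through plain char, so machines with signed and unsigned char produced different hashes for the same directory; s_hash_unsigned (0 or 3) shifts LEGACY/HALF_MD4/TEA onto their unsigned variants, which sit three values higher. A toy demonstration of the signedness effect (NOT ext4's real hash, only the byte-signedness issue):

#include <stdio.h>

static unsigned int toy_hash(const char *name, int len, int force_unsigned)
{
    unsigned int h = 0x12a3fe2d;

    while (len--) {
        int c = force_unsigned ? (unsigned char)*name : *name;
        h = h * 31 + (unsigned int)c;
        name++;
    }
    return h;
}

int main(void)
{
    const char name[] = "caf\xc3\xa9";  /* bytes >= 0x80 are the problem */

    printf("native char: %08x\n", toy_hash(name, 5, 0));
    printf("unsigned:    %08x\n", toy_hash(name, 5, 1));
    /* differ on signed-char CPUs (x86), identical on unsigned-char
     * CPUs (ARM, PowerPC) - the incompatibility the flag records */
    return 0;
}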
2184 diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
2185 index 3922a8b..0070431 100644
2186 --- a/fs/ext4/resize.c
2187 +++ b/fs/ext4/resize.c
2188 @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb,
2189 if ((err = extend_or_restart_transaction(handle, 2, bh)))
2190 goto exit_bh;
2191
2192 - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb),
2193 - bh->b_data);
2194 + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data);
2195 ext4_journal_dirty_metadata(handle, bh);
2196 brelse(bh);
2197 -
2198 /* Mark unused entries in inode bitmap used */
2199 ext4_debug("clear inode bitmap %#04llx (+%llu)\n",
2200 input->inode_bitmap, input->inode_bitmap - start);
2201 @@ -297,7 +295,7 @@ static int setup_new_group_blocks(struct super_block *sb,
2202 goto exit_journal;
2203 }
2204
2205 - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb),
2206 + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
2207 bh->b_data);
2208 ext4_journal_dirty_metadata(handle, bh);
2209 exit_bh:
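
mark_bitmap_end() pads every bit past the last valid entry with ones so the tail of the bitmap block can never be handed out; the fix widens the padding bound from EXT4_BLOCKS_PER_GROUP(sb) to the full block, sb->s_blocksize * 8 bits. The idea in miniature (sketch, not the kernel helper):

#include <stdint.h>
#include <stdio.h>

/* set every bit in [start, end) so the tail can never be allocated */
static void mark_tail(int start, int end, uint8_t *bitmap)
{
    int i;

    for (i = start; i < end; i++)
        bitmap[i >> 3] |= (uint8_t)(1u << (i & 7));
}

int main(void)
{
    uint8_t map[4] = { 0 };             /* a 32-bit "bitmap block" */

    mark_tail(20, 32, map);             /* only 20 valid entries */
    printf("%02x %02x %02x %02x\n", map[0], map[1], map[2], map[3]);
    return 0;
}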
2210 @@ -747,6 +745,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2211 struct inode *inode = NULL;
2212 handle_t *handle;
2213 int gdb_off, gdb_num;
2214 + int num_grp_locked = 0;
2215 int err, err2;
2216
2217 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
2218 @@ -787,6 +786,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2219 }
2220 }
2221
2222 +
2223 if ((err = verify_group_input(sb, input)))
2224 goto exit_put;
2225
2226 @@ -855,15 +855,18 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2227 * using the new disk blocks.
2228 */
2229
2230 + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
2231 /* Update group descriptor block for new group */
2232 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
2233 gdb_off * EXT4_DESC_SIZE(sb));
2234
2235 + memset(gdp, 0, EXT4_DESC_SIZE(sb));
2236 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
2237 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
2238 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
2239 gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count);
2240 gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb));
2241 + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
2242 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
2243
2244 /*
2245 @@ -871,9 +874,11 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2246 * descriptor
2247 */
2248 if (test_opt(sb, MBALLOC)) {
2249 - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
2250 - if (err)
2251 + err = ext4_mb_add_groupinfo(sb, input->group, gdp);
2252 + if (err) {
2253 + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2254 goto exit_journal;
2255 + }
2256 }
2257 /*
2258 * Make the new blocks and inodes valid next. We do this before
2259 @@ -915,6 +920,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
2260
2261 /* Update the global fs size fields */
2262 sbi->s_groups_count++;
2263 + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
2264
2265 ext4_journal_dirty_metadata(handle, primary);
2266
2267 @@ -976,9 +982,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2268 struct buffer_head * bh;
2269 handle_t *handle;
2270 int err;
2271 - unsigned long freed_blocks;
2272 ext4_group_t group;
2273 - struct ext4_group_info *grp;
2274
2275 /* We don't need to worry about locking wrt other resizers just
2276 * yet: we're going to revalidate es->s_blocks_count after
2277 @@ -1077,50 +1081,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
2278 unlock_super(sb);
2279 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
2280 o_blocks_count + add);
2281 - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
2282 + /* Add the blocks to the bitmap and mark the group as needing init */
2283 + ext4_add_groupblocks(handle, sb, o_blocks_count, add);
2284 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
2285 o_blocks_count + add);
2286 if ((err = ext4_journal_stop(handle)))
2287 goto exit_put;
2288
2289 - /*
2290 - * Mark mballoc pages as not up to date so that they will be updated
2291 - * next time they are loaded by ext4_mb_load_buddy.
2292 - */
2293 - if (test_opt(sb, MBALLOC)) {
2294 - struct ext4_sb_info *sbi = EXT4_SB(sb);
2295 - struct inode *inode = sbi->s_buddy_cache;
2296 - int blocks_per_page;
2297 - int block;
2298 - int pnum;
2299 - struct page *page;
2300 -
2301 - /* Set buddy page as not up to date */
2302 - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
2303 - block = group * 2;
2304 - pnum = block / blocks_per_page;
2305 - page = find_get_page(inode->i_mapping, pnum);
2306 - if (page != NULL) {
2307 - ClearPageUptodate(page);
2308 - page_cache_release(page);
2309 - }
2310 -
2311 - /* Set bitmap page as not up to date */
2312 - block++;
2313 - pnum = block / blocks_per_page;
2314 - page = find_get_page(inode->i_mapping, pnum);
2315 - if (page != NULL) {
2316 - ClearPageUptodate(page);
2317 - page_cache_release(page);
2318 - }
2319 -
2320 - /* Get the info on the last group */
2321 - grp = ext4_get_group_info(sb, group);
2322 -
2323 - /* Update free blocks in group info */
2324 - ext4_mb_update_group_info(grp, add);
2325 - }
2326 -
2327 if (test_opt(sb, DEBUG))
2328 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
2329 ext4_blocks_count(es));
2330 diff --git a/fs/ext4/super.c b/fs/ext4/super.c
2331 index 7726e8e..5e4491d 100644
2332 --- a/fs/ext4/super.c
2333 +++ b/fs/ext4/super.c
2334 @@ -1493,7 +1493,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2335 ext4_group_t flex_group_count;
2336 ext4_group_t flex_group;
2337 int groups_per_flex = 0;
2338 - __u64 block_bitmap = 0;
2339 int i;
2340
2341 if (!sbi->s_es->s_log_groups_per_flex) {
2342 @@ -1516,9 +1515,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
2343 goto failed;
2344 }
2345
2346 - gdp = ext4_get_group_desc(sb, 1, &bh);
2347 - block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
2348 -
2349 for (i = 0; i < sbi->s_groups_count; i++) {
2350 gdp = ext4_get_group_desc(sb, i, &bh);
2351
2352 @@ -1920,8 +1916,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2353 struct inode *root;
2354 int ret = -EINVAL;
2355 int blocksize;
2356 - int db_count;
2357 - int i;
2358 + unsigned int db_count;
2359 + unsigned int i;
2360 int needs_recovery;
2361 __le32 features;
2362 __u64 blocks_count;
2363 @@ -2172,6 +2168,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2364 for (i = 0; i < 4; i++)
2365 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
2366 sbi->s_def_hash_version = es->s_def_hash_version;
2367 + i = le32_to_cpu(es->s_flags);
2368 + if (i & EXT2_FLAGS_UNSIGNED_HASH)
2369 + sbi->s_hash_unsigned = 3;
2370 + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
2371 +#ifdef __CHAR_UNSIGNED__
2372 + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
2373 + sbi->s_hash_unsigned = 3;
2374 +#else
2375 + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
2376 +#endif
2377 + sb->s_dirt = 1;
2378 + }
2379
2380 if (sbi->s_blocks_per_group > blocksize * 8) {
2381 printk(KERN_ERR
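
Condensing the flag logic above: an explicit unsigned flag wins; an explicit signed flag keeps the offset at 0; and an old superblock with neither flag gets pinned to whatever this CPU's plain char does, with the choice written back so other machines agree. A standalone restatement (real on-disk flag values, illustrative function name):

#include <stdio.h>

#define EXT2_FLAGS_SIGNED_HASH   0x0001
#define EXT2_FLAGS_UNSIGNED_HASH 0x0002

/* returns the s_hash_unsigned offset: 0 (signed) or 3 (unsigned) */
static int hash_offset(unsigned int flags, int char_is_unsigned)
{
    if (flags & EXT2_FLAGS_UNSIGNED_HASH)
        return 3;
    if (!(flags & EXT2_FLAGS_SIGNED_HASH))
        /* legacy fs, no flag yet: pin to this CPU's plain char and
         * (in the real code) write the chosen flag back to disk */
        return char_is_unsigned ? 3 : 0;
    return 0;
}

int main(void)
{
    printf("offset on signed-char CPU, no flags: %d\n", hash_offset(0, 0));
    printf("offset with UNSIGNED flag set:       %d\n",
           hash_offset(EXT2_FLAGS_UNSIGNED_HASH, 0));
    return 0;
}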
2382 @@ -2199,20 +2207,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2383 if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
2384 goto cantfind_ext4;
2385
2386 - /* ensure blocks_count calculation below doesn't sign-extend */
2387 - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) <
2388 - le32_to_cpu(es->s_first_data_block) + 1) {
2389 - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, "
2390 - "first data block %u, blocks per group %lu\n",
2391 - ext4_blocks_count(es),
2392 - le32_to_cpu(es->s_first_data_block),
2393 - EXT4_BLOCKS_PER_GROUP(sb));
2394 + /*
2395 + * It makes no sense for the first data block to be beyond the end
2396 + * of the filesystem.
2397 + */
2398 + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
2399 + printk(KERN_WARNING "EXT4-fs: bad geometry: first data "
2400 + "block %u is beyond end of filesystem (%llu)\n",
2401 + le32_to_cpu(es->s_first_data_block),
2402 + ext4_blocks_count(es));
2403 goto failed_mount;
2404 }
2405 blocks_count = (ext4_blocks_count(es) -
2406 le32_to_cpu(es->s_first_data_block) +
2407 EXT4_BLOCKS_PER_GROUP(sb) - 1);
2408 do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
2409 + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
2410 + printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
2411 + "(block count %llu, first data block %u, "
2412 + "blocks per group %lu)\n", sbi->s_groups_count,
2413 + ext4_blocks_count(es),
2414 + le32_to_cpu(es->s_first_data_block),
2415 + EXT4_BLOCKS_PER_GROUP(sb));
2416 + goto failed_mount;
2417 + }
2418 sbi->s_groups_count = blocks_count;
2419 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2420 EXT4_DESC_PER_BLOCK(sb);
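
A worked example of the geometry math, under the common 4 KiB-blocksize assumptions (32768 blocks per group, first data block 0):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t blocks = 1ULL << 32;       /* 2^32 blocks = 16 TiB at 4 KiB */
    uint64_t first = 0;                 /* first data block */
    uint32_t per_group = 32768;
    uint64_t groups = (blocks - first + per_group - 1) / per_group;

    printf("groups = %llu\n", (unsigned long long)groups);  /* 131072 */
    /* sbi->s_groups_count is 32-bit; the new check rejects anything
     * within EXT4_DESC_PER_BLOCK(sb) of 2^32, where the descriptor
     * block arithmetic below it would wrap */
    return 0;
}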
2421 diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
2422 index 6caf22d..b1f0756 100644
2423 --- a/fs/jbd2/commit.c
2424 +++ b/fs/jbd2/commit.c
2425 @@ -24,6 +24,7 @@
2426 #include <linux/crc32.h>
2427 #include <linux/writeback.h>
2428 #include <linux/backing-dev.h>
2429 +#include <linux/bio.h>
2430
2431 /*
2432 * Default IO end handler for temporary BJ_IO buffer_heads.
2433 @@ -170,12 +171,34 @@ static int journal_submit_commit_record(journal_t *journal,
2434 * This function along with journal_submit_commit_record
2435 * allows to write the commit record asynchronously.
2436 */
2437 -static int journal_wait_on_commit_record(struct buffer_head *bh)
2438 +static int journal_wait_on_commit_record(journal_t *journal,
2439 + struct buffer_head *bh)
2440 {
2441 int ret = 0;
2442
2443 +retry:
2444 clear_buffer_dirty(bh);
2445 wait_on_buffer(bh);
2446 + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
2447 + printk(KERN_WARNING
2448 + "JBD2: wait_on_commit_record: sync failed on %s - "
2449 + "disabling barriers\n", journal->j_devname);
2450 + spin_lock(&journal->j_state_lock);
2451 + journal->j_flags &= ~JBD2_BARRIER;
2452 + spin_unlock(&journal->j_state_lock);
2453 +
2454 + lock_buffer(bh);
2455 + clear_buffer_dirty(bh);
2456 + set_buffer_uptodate(bh);
2457 + bh->b_end_io = journal_end_buffer_io_sync;
2458 +
2459 + ret = submit_bh(WRITE_SYNC, bh);
2460 + if (ret) {
2461 + unlock_buffer(bh);
2462 + return ret;
2463 + }
2464 + goto retry;
2465 + }
2466
2467 if (unlikely(!buffer_uptodate(bh)))
2468 ret = -EIO;
2469 @@ -795,7 +818,7 @@ wait_for_iobuf:
2470 __jbd2_journal_abort_hard(journal);
2471 }
2472 if (!err && !is_journal_aborted(journal))
2473 - err = journal_wait_on_commit_record(cbh);
2474 + err = journal_wait_on_commit_record(journal, cbh);
2475
2476 if (err)
2477 jbd2_journal_abort(journal, err);
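
The retry added to journal_wait_on_commit_record() is the usual "barrier unsupported" fallback: the first failure clears the journal's barrier flag and the commit block is resubmitted as an ordinary write. Schematic standalone version (stub device that rejects barriers; not the jbd2 API):

#include <errno.h>
#include <stdio.h>

struct dev { int use_barriers; };

/* stand-in for submit_bh()+wait; pretend the device rejects barriers */
static int submit_and_wait(struct dev *d, int barrier)
{
    (void)d;
    return barrier ? -EOPNOTSUPP : 0;
}

static int write_commit_block(struct dev *d)
{
    int err = submit_and_wait(d, d->use_barriers);

    if (err == -EOPNOTSUPP && d->use_barriers) {
        fprintf(stderr, "sync failed - disabling barriers\n");
        d->use_barriers = 0;            /* remembered for later commits */
        err = submit_and_wait(d, 0);    /* resubmit as a plain write */
    }
    return err;
}

int main(void)
{
    struct dev d = { 1 };
    return write_commit_block(&d);
}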
2478 diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
2479 index 66c3499..0e1bd70 100644
2480 --- a/include/linux/jbd2.h
2481 +++ b/include/linux/jbd2.h
2482 @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh);
2483 int val = (expr); \
2484 if (!val) { \
2485 printk(KERN_ERR \
2486 - "EXT3-fs unexpected failure: %s;\n",# expr); \
2487 + "JBD2 unexpected failure: %s: %s;\n", \
2488 + __func__, #expr); \
2489 printk(KERN_ERR why "\n"); \
2490 } \
2491 val; \
2492 @@ -329,6 +330,7 @@ enum jbd_state_bits {
2493 BH_State, /* Pins most journal_head state */
2494 BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
2495 BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
2496 + BH_JBDPrivateStart, /* First bit available for private use by FS */
2497 };
2498
2499 BUFFER_FNS(JBD, jbd)
2500 diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
2501 index 794e546..e7e7c7d 100644
2502 --- a/include/linux/pci_ids.h
2503 +++ b/include/linux/pci_ids.h
2504 @@ -1301,6 +1301,7 @@
2505 #define PCI_DEVICE_ID_VIA_VT3351 0x0351
2506 #define PCI_DEVICE_ID_VIA_VT3364 0x0364
2507 #define PCI_DEVICE_ID_VIA_8371_0 0x0391
2508 +#define PCI_DEVICE_ID_VIA_6415 0x0415
2509 #define PCI_DEVICE_ID_VIA_8501_0 0x0501
2510 #define PCI_DEVICE_ID_VIA_82C561 0x0561
2511 #define PCI_DEVICE_ID_VIA_82C586_1 0x0571
2512 diff --git a/include/linux/pid.h b/include/linux/pid.h
2513 index d7e98ff..93997c9 100644
2514 --- a/include/linux/pid.h
2515 +++ b/include/linux/pid.h
2516 @@ -123,6 +123,24 @@ extern struct pid *alloc_pid(struct pid_namespace *ns);
2517 extern void free_pid(struct pid *pid);
2518
2519 /*
2520 + * ns_of_pid() returns the pid namespace in which the specified pid was
2521 + * allocated.
2522 + *
2523 + * NOTE:
2524 + * ns_of_pid() is expected to be called for a process (task) that has
2525 + * an attached 'struct pid' (see attach_pid(), detach_pid()), i.e. @pid
2526 + * is expected to be non-NULL. If @pid is NULL, the caller should handle
2527 + * the resulting NULL pid-ns.
2528 + */
2529 +static inline struct pid_namespace *ns_of_pid(struct pid *pid)
2530 +{
2531 + struct pid_namespace *ns = NULL;
2532 + if (pid)
2533 + ns = pid->numbers[pid->level].ns;
2534 + return ns;
2535 +}
2536 +
2537 +/*
2538 * the helpers to get the pid's id seen from different namespaces
2539 *
2540 * pid_nr() : global id, i.e. the id seen from the init namespace;
2541 diff --git a/ipc/mqueue.c b/ipc/mqueue.c
2542 index a58bfad..ca502aa 100644
2543 --- a/ipc/mqueue.c
2544 +++ b/ipc/mqueue.c
2545 @@ -498,7 +498,8 @@ static void __do_notify(struct mqueue_inode_info *info)
2546 sig_i.si_errno = 0;
2547 sig_i.si_code = SI_MESGQ;
2548 sig_i.si_value = info->notify.sigev_value;
2549 - sig_i.si_pid = task_tgid_vnr(current);
2550 + sig_i.si_pid = task_tgid_nr_ns(current,
2551 + ns_of_pid(info->notify_owner));
2552 sig_i.si_uid = current->uid;
2553
2554 kill_pid_info(info->notify.sigev_signo,
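
ns_of_pid() simply indexes the pid's numbers[] array at the pid's own level; __do_notify() then reports the sender's tgid as numbered in the notification owner's namespace rather than the sender's. A user-space mirror of the data layout with toy values (the real numbers[] is a flexible array sized by namespace depth):

#include <stdio.h>
#include <stddef.h>

struct pid_namespace { int level; };
struct upid { int nr; struct pid_namespace *ns; };

struct pid {
    int level;                  /* deepest namespace this pid lives in */
    struct upid numbers[2];     /* toy: one (id, ns) pair per level */
};

/* mirror of the new helper: namespace the pid was allocated in */
static struct pid_namespace *ns_of_pid(struct pid *pid)
{
    return pid ? pid->numbers[pid->level].ns : NULL;
}

int main(void)
{
    struct pid_namespace init_ns = { 0 }, child_ns = { 1 };
    struct pid p = { 1, { { 1042, &init_ns }, { 7, &child_ns } } };

    /* one task, two ids: 1042 in the init ns, 7 inside the child ns */
    printf("allocated in level-%d ns; id there: %d\n",
           ns_of_pid(&p)->level, p.numbers[p.level].nr);
    return 0;
}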