git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
2.6.27 ext4 patches
author Greg Kroah-Hartman <gregkh@suse.de>
Mon, 19 Apr 2010 17:26:16 +0000 (10:26 -0700)
committer Greg Kroah-Hartman <gregkh@suse.de>
Mon, 19 Apr 2010 17:26:16 +0000 (10:26 -0700)
12 files changed:
queue-2.6.27/ext4-add-percpu-dirty-block-accounting.patch [new file with mode: 0644]
queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch [new file with mode: 0644]
queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch [new file with mode: 0644]
queue-2.6.27/ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch [new file with mode: 0644]
queue-2.6.27/ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch [new file with mode: 0644]
queue-2.6.27/ext4-retry-block-allocation-if-we-have-free-blocks-left.patch [new file with mode: 0644]
queue-2.6.27/ext4-retry-block-reservation.patch [new file with mode: 0644]
queue-2.6.27/ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch [new file with mode: 0644]
queue-2.6.27/percpu-counter-clean-up-percpu_counter_sum_and_set.patch [new file with mode: 0644]
queue-2.6.27/series
queue-2.6.27/vfs-add-no_nrwrite_index_update-writeback-control-flag.patch [new file with mode: 0644]
queue-2.6.27/vfs-remove-the-range_cont-writeback-mode.patch [new file with mode: 0644]

diff --git a/queue-2.6.27/ext4-add-percpu-dirty-block-accounting.patch b/queue-2.6.27/ext4-add-percpu-dirty-block-accounting.patch
new file mode 100644 (file)
index 0000000..a86a935
--- /dev/null
@@ -0,0 +1,312 @@
+From tytso@mit.edu  Mon Apr 19 10:21:01 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:25:58 -0400
+Subject: ext4: Add percpu dirty block accounting.
+To: stable@kernel.org
+Cc: "Theodore Ts'o" <tytso@mit.edu>, Ext4 Developers List <linux-ext4@vger.kernel.org>, Mingming Cao <cmm@us.ibm.com>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-5-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 6bc6e63fcd7dac9e633ea29f1fddd9580ab28f3f upstream.
+
+This patch adds dirty block accounting using percpu_counters.  Delayed
+allocation block reservation is now done by updating the dirty block
+counter.  In a later patch we switch to non-delalloc mode if the
+filesystem's free block count is greater than 150% of its total dirty
+block count.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Mingming Cao<cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/balloc.c  |   62 ++++++++++++++++++++++++++++++++++--------------------
+ fs/ext4/ext4_sb.h |    1 
+ fs/ext4/inode.c   |   22 +++++++++----------
+ fs/ext4/mballoc.c |   31 ++++++++++++---------------
+ fs/ext4/super.c   |    8 ++++++
+ 5 files changed, 73 insertions(+), 51 deletions(-)
+
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -1757,26 +1757,38 @@ out:
+ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+                                               ext4_fsblk_t nblocks)
+ {
+-      s64 free_blocks;
++      s64 free_blocks, dirty_blocks;
+       ext4_fsblk_t root_blocks = 0;
+       struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
++      struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
+-      free_blocks = percpu_counter_read(fbc);
++      free_blocks  = percpu_counter_read_positive(fbc);
++      dirty_blocks = percpu_counter_read_positive(dbc);
+       if (!capable(CAP_SYS_RESOURCE) &&
+               sbi->s_resuid != current->fsuid &&
+               (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+               root_blocks = ext4_r_blocks_count(sbi->s_es);
+-      if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+-              free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
+-
+-      if (free_blocks < (root_blocks + nblocks))
++      if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
++                                              EXT4_FREEBLOCKS_WATERMARK) {
++              free_blocks  = percpu_counter_sum(fbc);
++              dirty_blocks = percpu_counter_sum(dbc);
++              if (dirty_blocks < 0) {
++                      printk(KERN_CRIT "Dirty block accounting "
++                                      "went wrong %lld\n",
++                                      dirty_blocks);
++              }
++      }
++      /* Check whether we have space after
++       * accounting for current dirty blocks
++       */
++      if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
+               /* we don't have free space */
+               return -ENOSPC;
+-      /* reduce fs free blocks counter */
+-      percpu_counter_sub(fbc, nblocks);
++      /* Add the blocks to nblocks */
++      percpu_counter_add(dbc, nblocks);
+       return 0;
+ }
+@@ -1792,23 +1804,28 @@ int ext4_claim_free_blocks(struct ext4_s
+ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                                               ext4_fsblk_t nblocks)
+ {
+-      ext4_fsblk_t free_blocks;
++      ext4_fsblk_t free_blocks, dirty_blocks;
+       ext4_fsblk_t root_blocks = 0;
++      struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
++      struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
+-      free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
++      free_blocks  = percpu_counter_read_positive(fbc);
++      dirty_blocks = percpu_counter_read_positive(dbc);
+       if (!capable(CAP_SYS_RESOURCE) &&
+               sbi->s_resuid != current->fsuid &&
+               (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+               root_blocks = ext4_r_blocks_count(sbi->s_es);
+-      if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+-              free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+-
+-      if (free_blocks <= root_blocks)
++      if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
++                                              EXT4_FREEBLOCKS_WATERMARK) {
++              free_blocks  = percpu_counter_sum_positive(fbc);
++              dirty_blocks = percpu_counter_sum_positive(dbc);
++      }
++      if (free_blocks <= (root_blocks + dirty_blocks))
+               /* we don't have free space */
+               return 0;
+-      if (free_blocks - root_blocks < nblocks)
++      if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+               return free_blocks - root_blocks;
+       return nblocks;
+ }
+@@ -2089,13 +2106,14 @@ allocated:
+       le16_add_cpu(&gdp->bg_free_blocks_count, -num);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
+       spin_unlock(sb_bgl_lock(sbi, group_no));
+-      if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+-              /*
+-               * we allocated less blocks than we
+-               * claimed. Add the difference back.
+-               */
+-              percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+-      }
++      percpu_counter_sub(&sbi->s_freeblocks_counter, num);
++      /*
++       * Now reduce the dirty block count also. Should not go negative
++       */
++      if (!EXT4_I(inode)->i_delalloc_reserved_flag)
++              percpu_counter_sub(&sbi->s_dirtyblocks_counter, *count);
++      else
++              percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+               spin_lock(sb_bgl_lock(sbi, flex_group));
+--- a/fs/ext4/ext4_sb.h
++++ b/fs/ext4/ext4_sb.h
+@@ -60,6 +60,7 @@ struct ext4_sb_info {
+       struct percpu_counter s_freeblocks_counter;
+       struct percpu_counter s_freeinodes_counter;
+       struct percpu_counter s_dirs_counter;
++      struct percpu_counter s_dirtyblocks_counter;
+       struct blockgroup_lock s_blockgroup_lock;
+       /* root of the per fs reservation window tree */
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1032,19 +1032,20 @@ static void ext4_da_update_reserve_space
+       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+       mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+-      /* Account for allocated meta_blocks */
+-      mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+-
+-      /* update fs free blocks counter for truncate case */
+-      percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
++      if (mdb_free) {
++              /* Account for allocated meta_blocks */
++              mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
++
++              /* update fs dirty blocks counter */
++              percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
++              EXT4_I(inode)->i_allocated_meta_blocks = 0;
++              EXT4_I(inode)->i_reserved_meta_blocks = mdb;
++      }
+       /* update per-inode reservations */
+       BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
+       EXT4_I(inode)->i_reserved_data_blocks -= used;
+-      BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+-      EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+-      EXT4_I(inode)->i_allocated_meta_blocks = 0;
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+       /*
+@@ -1609,8 +1610,8 @@ static void ext4_da_release_space(struct
+       release = to_free + mdb_free;
+-      /* update fs free blocks counter for truncate case */
+-      percpu_counter_add(&sbi->s_freeblocks_counter, release);
++      /* update fs dirty blocks counter for truncate case */
++      percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+       /* update per-inode reservations */
+       BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+@@ -2546,7 +2547,6 @@ static int ext4_da_write_begin(struct fi
+       index = pos >> PAGE_CACHE_SHIFT;
+       from = pos & (PAGE_CACHE_SIZE - 1);
+       to = from + len;
+-
+ retry:
+       /*
+        * With delayed allocation, we don't log the i_disksize update
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3100,7 +3100,7 @@ void exit_ext4_mballoc(void)
+  */
+ static noinline_for_stack int
+ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+-                              handle_t *handle)
++                              handle_t *handle, unsigned long reserv_blks)
+ {
+       struct buffer_head *bitmap_bh = NULL;
+       struct ext4_super_block *es;
+@@ -3188,21 +3188,16 @@ ext4_mb_mark_diskspace_used(struct ext4_
+       le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+       spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+-
++      percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+       /*
+-       * free blocks account has already be reduced/reserved
+-       * at write_begin() time for delayed allocation
+-       * do not double accounting
++       * Now reduce the dirty block count also. Should not go negative
+        */
+-      if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+-                      ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+-              /*
+-               * we allocated less blocks than we calimed
+-               * Add the difference back
+-               */
+-              percpu_counter_add(&sbi->s_freeblocks_counter,
+-                              ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+-      }
++      if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
++              /* release all the reserved blocks if non delalloc */
++              percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
++      else
++              percpu_counter_sub(&sbi->s_dirtyblocks_counter,
++                                              ac->ac_b_ex.fe_len);
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi,
+@@ -4636,12 +4631,13 @@ static int ext4_mb_discard_preallocation
+ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
+                                struct ext4_allocation_request *ar, int *errp)
+ {
++      int freed;
+       struct ext4_allocation_context *ac = NULL;
+       struct ext4_sb_info *sbi;
+       struct super_block *sb;
+       ext4_fsblk_t block = 0;
+-      int freed;
+-      int inquota;
++      unsigned long inquota;
++      unsigned long reserv_blks = 0;
+       sb = ar->inode->i_sb;
+       sbi = EXT4_SB(sb);
+@@ -4659,6 +4655,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
+                       *errp = -ENOSPC;
+                       return 0;
+               }
++              reserv_blks = ar->len;
+       }
+       while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
+               ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+@@ -4704,7 +4701,7 @@ repeat:
+                       ext4_mb_new_preallocation(ac);
+       }
+       if (likely(ac->ac_status == AC_STATUS_FOUND)) {
+-              *errp = ext4_mb_mark_diskspace_used(ac, handle);
++              *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
+               if (*errp ==  -EAGAIN) {
+                       /*
+                        * drop the reference that we took
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -521,6 +521,7 @@ static void ext4_put_super(struct super_
+       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeinodes_counter);
+       percpu_counter_destroy(&sbi->s_dirs_counter);
++      percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+       brelse(sbi->s_sbh);
+ #ifdef CONFIG_QUOTA
+       for (i = 0; i < MAXQUOTAS; i++)
+@@ -2280,6 +2281,9 @@ static int ext4_fill_super(struct super_
+               err = percpu_counter_init(&sbi->s_dirs_counter,
+                               ext4_count_dirs(sb));
+       }
++      if (!err) {
++              err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
++      }
+       if (err) {
+               printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+               goto failed_mount3;
+@@ -2517,6 +2521,7 @@ failed_mount3:
+       percpu_counter_destroy(&sbi->s_freeblocks_counter);
+       percpu_counter_destroy(&sbi->s_freeinodes_counter);
+       percpu_counter_destroy(&sbi->s_dirs_counter);
++      percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
+@@ -3208,7 +3213,8 @@ static int ext4_statfs(struct dentry *de
+       buf->f_type = EXT4_SUPER_MAGIC;
+       buf->f_bsize = sb->s_blocksize;
+       buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
+-      buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
++      buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
++                     percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+       ext4_free_blocks_count_set(es, buf->f_bfree);
+       buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
+       if (buf->f_bfree < ext4_r_blocks_count(es))
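
Outside the diff context, the accounting scheme this patch introduces can be summarized in a minimal userspace sketch: writers reserve blocks into a dirty counter at write_begin() time, and the allocator later converts the reservation by decrementing both counters. Plain 64-bit integers stand in for struct percpu_counter, and all names here are illustrative, not kernel APIs.

/* Minimal model of the dirty-block accounting above. */
#include <stdio.h>
#include <stdint.h>

static int64_t free_blocks;   /* models s_freeblocks_counter  */
static int64_t dirty_blocks;  /* models s_dirtyblocks_counter */

/* Reserve at write_begin() time: only the dirty counter moves,
 * mirroring ext4_claim_free_blocks() after this patch. */
static int claim_blocks(int64_t nblocks, int64_t root_blocks)
{
        if (free_blocks < root_blocks + nblocks + dirty_blocks)
                return -1;               /* would be -ENOSPC */
        dirty_blocks += nblocks;
        return 0;
}

/* At allocation time the blocks become really used: the free count
 * drops and the matching dirty reservation is released. */
static void allocate_blocks(int64_t num)
{
        free_blocks -= num;
        dirty_blocks -= num;
}

int main(void)
{
        free_blocks = 1000;
        if (claim_blocks(200, 50) == 0)
                allocate_blocks(200);
        printf("free=%lld dirty=%lld\n",
               (long long)free_blocks, (long long)dirty_blocks);
        return 0;
}
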
diff --git a/queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch b/queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch
new file mode 100644 (file)
index 0000000..00bd977
--- /dev/null
@@ -0,0 +1,199 @@
+From tytso@mit.edu  Mon Apr 19 10:23:42 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:26:04 -0400
+Subject: ext4: Fix file fragmentation during large file write.
+To: stable@kernel.org
+Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-11-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream.
+
+The range_cyclic writeback mode uses the address_space writeback_index
+as the start index for writeback.  With delayed allocation we were
+updating writeback_index wrongly, resulting in highly fragmented files.
+This patch reduces the number of extents for a 3GB file from 4000
+to 27.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+[dev@jaysonking.com: Some changed lines from the original version of this patch were dropped, since they were rolled up with another cherry-picked patch applied to 2.6.27.y earlier.]
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c |   88 +++++++++++++++++++++++++++++++++++---------------------
+ 1 file changed, 55 insertions(+), 33 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1721,7 +1721,11 @@ static int mpage_da_submit_io(struct mpa
+                       pages_skipped = mpd->wbc->pages_skipped;
+                       err = mapping->a_ops->writepage(page, mpd->wbc);
+-                      if (!err)
++                      if (!err && (pages_skipped == mpd->wbc->pages_skipped))
++                              /*
++                               * have successfully written the page
++                               * without skipping the same
++                               */
+                               mpd->pages_written++;
+                       /*
+                        * In error case, we have to continue because
+@@ -2175,7 +2179,6 @@ static int mpage_da_writepages(struct ad
+                              struct writeback_control *wbc,
+                              struct mpage_da_data *mpd)
+ {
+-      long to_write;
+       int ret;
+       if (!mpd->get_block)
+@@ -2190,19 +2193,18 @@ static int mpage_da_writepages(struct ad
+       mpd->pages_written = 0;
+       mpd->retval = 0;
+-      to_write = wbc->nr_to_write;
+-
+       ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
+-
+       /*
+        * Handle last extent of pages
+        */
+       if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+               if (mpage_da_map_blocks(mpd) == 0)
+                       mpage_da_submit_io(mpd);
+-      }
+-      wbc->nr_to_write = to_write - mpd->pages_written;
++              mpd->io_done = 1;
++              ret = MPAGE_DA_EXTENT_TAIL;
++      }
++      wbc->nr_to_write -= mpd->pages_written;
+       return ret;
+ }
+@@ -2447,11 +2449,14 @@ static int ext4_da_writepages_trans_bloc
+ static int ext4_da_writepages(struct address_space *mapping,
+                             struct writeback_control *wbc)
+ {
++      pgoff_t index;
++      int range_whole = 0;
+       handle_t *handle = NULL;
+       struct mpage_da_data mpd;
+       struct inode *inode = mapping->host;
++      int no_nrwrite_index_update;
++      long pages_written = 0, pages_skipped;
+       int needed_blocks, ret = 0, nr_to_writebump = 0;
+-      long to_write, pages_skipped = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+       /*
+@@ -2485,16 +2490,26 @@ static int ext4_da_writepages(struct add
+               nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+               wbc->nr_to_write = sbi->s_mb_stream_request;
+       }
++      if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
++              range_whole = 1;
+-
+-      pages_skipped = wbc->pages_skipped;
++      if (wbc->range_cyclic)
++              index = mapping->writeback_index;
++      else
++              index = wbc->range_start >> PAGE_CACHE_SHIFT;
+       mpd.wbc = wbc;
+       mpd.inode = mapping->host;
+-restart_loop:
+-      to_write = wbc->nr_to_write;
+-      while (!ret && to_write > 0) {
++      /*
++       * we don't want write_cache_pages to update
++       * nr_to_write and writeback_index
++       */
++      no_nrwrite_index_update = wbc->no_nrwrite_index_update;
++      wbc->no_nrwrite_index_update = 1;
++      pages_skipped = wbc->pages_skipped;
++
++      while (!ret && wbc->nr_to_write > 0) {
+               /*
+                * we  insert one extent at a time. So we need
+@@ -2527,46 +2542,53 @@ restart_loop:
+                               goto out_writepages;
+                       }
+               }
+-              to_write -= wbc->nr_to_write;
+-
+               mpd.get_block = ext4_da_get_block_write;
+               ret = mpage_da_writepages(mapping, wbc, &mpd);
+               ext4_journal_stop(handle);
+-              if (mpd.retval == -ENOSPC)
++              if (mpd.retval == -ENOSPC) {
++                      /* commit the transaction which would
++                       * free blocks released in the transaction
++                       * and try again
++                       */
+                       jbd2_journal_force_commit_nested(sbi->s_journal);
+-
+-              /* reset the retry count */
+-              if (ret == MPAGE_DA_EXTENT_TAIL) {
++                      wbc->pages_skipped = pages_skipped;
++                      ret = 0;
++              } else if (ret == MPAGE_DA_EXTENT_TAIL) {
+                       /*
+                        * got one extent now try with
+                        * rest of the pages
+                        */
+-                      to_write += wbc->nr_to_write;
++                      pages_written += mpd.pages_written;
++                      wbc->pages_skipped = pages_skipped;
+                       ret = 0;
+-              } else if (wbc->nr_to_write) {
++              } else if (wbc->nr_to_write)
+                       /*
+                        * There is no more writeout needed
+                        * or we requested for a noblocking writeout
+                        * and we found the device congested
+                        */
+-                      to_write += wbc->nr_to_write;
+                       break;
+-              }
+-              wbc->nr_to_write = to_write;
+-      }
+-
+-      if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
+-              /* We skipped pages in this loop */
+-              wbc->nr_to_write = to_write +
+-                              wbc->pages_skipped - pages_skipped;
+-              wbc->pages_skipped = pages_skipped;
+-              goto restart_loop;
+       }
++      if (pages_skipped != wbc->pages_skipped)
++              printk(KERN_EMERG "This should not happen leaving %s "
++                              "with nr_to_write = %ld ret = %d\n",
++                              __func__, wbc->nr_to_write, ret);
++
++      /* Update index */
++      index += pages_written;
++      if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
++              /*
++               * set the writeback_index so that range_cyclic
++               * mode will write it back later
++               */
++              mapping->writeback_index = index;
+ out_writepages:
+-      wbc->nr_to_write = to_write - nr_to_writebump;
++      if (!no_nrwrite_index_update)
++              wbc->no_nrwrite_index_update = 0;
++      wbc->nr_to_write -= nr_to_writebump;
+       return ret;
+ }
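
The core of this fix is the bookkeeping around writeback_index: it must advance only by the number of pages actually written, so the next range_cyclic pass resumes where this one stopped instead of re-fragmenting the file. A toy model of just that bookkeeping, with all names illustrative rather than the kernel's:

#include <stdio.h>

struct mapping { unsigned long writeback_index; };

static void da_writepages(struct mapping *m, long pages_written)
{
        unsigned long index = m->writeback_index; /* start of this pass */

        /* ... pages_written pages get written starting at index ... */

        /* Resume point for the next cyclic pass: one past the last
         * page written, not an unrelated loop counter. */
        m->writeback_index = index + pages_written;
}

int main(void)
{
        struct mapping m = { .writeback_index = 0 };

        da_writepages(&m, 27);
        da_writepages(&m, 13);
        printf("next pass starts at page %lu\n", m.writeback_index);
        return 0;
}
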
diff --git a/queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch b/queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
new file mode 100644 (file)
index 0000000..e3e0632
--- /dev/null
@@ -0,0 +1,103 @@
+From tytso@mit.edu  Mon Apr 19 10:24:03 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:26:05 -0400
+Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages
+To: stable@kernel.org
+Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-12-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 2acf2c261b823d9d9ed954f348b97620297a36b5 upstream.
+
+With delayed allocation we lock the page in write_cache_pages() and
+try to build an in-memory extent of contiguous blocks.  This is needed
+so that we can issue large contiguous block requests.  If range_cyclic
+mode is enabled, write_cache_pages() will loop back to index 0 if
+no I/O has been done yet, and try to start writing from the beginning
+of the range.  That causes an attempt to take the page lock of a
+lower-index page while holding the page lock of a higher-index page,
+which can cause a deadlock with another writeback thread.
+
+The solution is to implement the range_cyclic behavior in
+ext4_da_writepages() instead.
+
+http://bugzilla.kernel.org/show_bug.cgi?id=12579
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c |   21 +++++++++++++++++++--
+ 1 file changed, 19 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2456,6 +2456,7 @@ static int ext4_da_writepages(struct add
+       struct inode *inode = mapping->host;
+       int no_nrwrite_index_update;
+       long pages_written = 0, pages_skipped;
++      int range_cyclic, cycled = 1, io_done = 0;
+       int needed_blocks, ret = 0, nr_to_writebump = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+@@ -2493,9 +2494,15 @@ static int ext4_da_writepages(struct add
+       if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+               range_whole = 1;
+-      if (wbc->range_cyclic)
++      range_cyclic = wbc->range_cyclic;
++      if (wbc->range_cyclic) {
+               index = mapping->writeback_index;
+-      else
++              if (index)
++                      cycled = 0;
++              wbc->range_start = index << PAGE_CACHE_SHIFT;
++              wbc->range_end  = LLONG_MAX;
++              wbc->range_cyclic = 0;
++      } else
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+       mpd.wbc = wbc;
+@@ -2509,6 +2516,7 @@ static int ext4_da_writepages(struct add
+       wbc->no_nrwrite_index_update = 1;
+       pages_skipped = wbc->pages_skipped;
++retry:
+       while (!ret && wbc->nr_to_write > 0) {
+               /*
+@@ -2563,6 +2571,7 @@ static int ext4_da_writepages(struct add
+                       pages_written += mpd.pages_written;
+                       wbc->pages_skipped = pages_skipped;
+                       ret = 0;
++                      io_done = 1;
+               } else if (wbc->nr_to_write)
+                       /*
+                        * There is no more writeout needed
+@@ -2571,6 +2580,13 @@ static int ext4_da_writepages(struct add
+                        */
+                       break;
+       }
++      if (!io_done && !cycled) {
++              cycled = 1;
++              index = 0;
++              wbc->range_start = index << PAGE_CACHE_SHIFT;
++              wbc->range_end  = mapping->writeback_index - 1;
++              goto retry;
++      }
+       if (pages_skipped != wbc->pages_skipped)
+               printk(KERN_EMERG "This should not happen leaving %s "
+                               "with nr_to_write = %ld ret = %d\n",
+@@ -2578,6 +2594,7 @@ static int ext4_da_writepages(struct add
+       /* Update index */
+       index += pages_written;
++      wbc->range_cyclic = range_cyclic;
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               /*
+                * set the writeback_index so that range_cyclic
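
The cycling behavior the patch moves into ext4_da_writepages() can be shown in isolation: write from writeback_index to the end of the range, and only if no I/O was done wrap around once to cover the skipped low part. The sketch below compiles standalone; write_range() is a stand-in for the write_cache_pages() pass and its dirty-page layout is invented for the example.

#include <stdio.h>

/* Stand-in for the write_cache_pages() pass; pretend only pages
 * below 50 are dirty, so a pass starting at 100 writes nothing
 * and has to wrap.  Returns the number of pages written. */
static long write_range(unsigned long start, unsigned long end)
{
        if (start >= 50)
                return 0;
        return (end < 50 ? end : 49) - start + 1;
}

static void writepages_cyclic(unsigned long *writeback_index,
                              unsigned long last_page)
{
        unsigned long index = *writeback_index;
        int cycled = (index == 0);  /* starting at 0 needs no wrap */
        long done;

retry:
        done = write_range(index, last_page);
        if (done == 0 && !cycled) {
                /* Nothing written in [index, last_page]: wrap exactly
                 * once, covering only the part that was skipped. */
                cycled = 1;
                last_page = index - 1;
                index = 0;
                goto retry;
        }
        *writeback_index = index + done;
}

int main(void)
{
        unsigned long wb_index = 100;

        writepages_cyclic(&wb_index, 1000);
        printf("writeback_index=%lu\n", wb_index);  /* prints 50 */
        return 0;
}
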
diff --git a/queue-2.6.27/ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch b/queue-2.6.27/ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch
new file mode 100644 (file)
index 0000000..33b9713
--- /dev/null
@@ -0,0 +1,171 @@
+From tytso@mit.edu  Mon Apr 19 10:19:40 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:25:55 -0400
+Subject: ext4: invalidate pages if delalloc block allocation fails.
+To: stable@kernel.org
+Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-2-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit c4a0c46ec92c194c873232b88debce4e1a448483 upstream.
+
+We are a bit aggressive in invalidating all the pages.  But
+it is OK, because we really don't know why the block allocation
+failed, and it is better to come out of the writeback path
+so that the user can look for more information.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c |   85 ++++++++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 73 insertions(+), 12 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1821,6 +1821,39 @@ static inline void __unmap_underlying_bl
+               unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+ }
++static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
++                                      sector_t logical, long blk_cnt)
++{
++      int nr_pages, i;
++      pgoff_t index, end;
++      struct pagevec pvec;
++      struct inode *inode = mpd->inode;
++      struct address_space *mapping = inode->i_mapping;
++
++      index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
++      end   = (logical + blk_cnt - 1) >>
++                              (PAGE_CACHE_SHIFT - inode->i_blkbits);
++      while (index <= end) {
++              nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
++              if (nr_pages == 0)
++                      break;
++              for (i = 0; i < nr_pages; i++) {
++                      struct page *page = pvec.pages[i];
++                      index = page->index;
++                      if (index > end)
++                              break;
++                      index++;
++
++                      BUG_ON(!PageLocked(page));
++                      BUG_ON(PageWriteback(page));
++                      block_invalidatepage(page, 0);
++                      ClearPageUptodate(page);
++                      unlock_page(page);
++              }
++      }
++      return;
++}
++
+ /*
+  * mpage_da_map_blocks - go through given space
+  *
+@@ -1830,7 +1863,7 @@ static inline void __unmap_underlying_bl
+  * The function skips space we know is already mapped to disk blocks.
+  *
+  */
+-static void mpage_da_map_blocks(struct mpage_da_data *mpd)
++static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
+ {
+       int err = 0;
+       struct buffer_head *lbh = &mpd->lbh;
+@@ -1841,7 +1874,7 @@ static void mpage_da_map_blocks(struct m
+        * We consider only non-mapped and non-allocated blocks
+        */
+       if (buffer_mapped(lbh) && !buffer_delay(lbh))
+-              return;
++              return 0;
+       new.b_state = lbh->b_state;
+       new.b_blocknr = 0;
+@@ -1852,10 +1885,38 @@ static void mpage_da_map_blocks(struct m
+        * to write simply return
+        */
+       if (!new.b_size)
+-              return;
++              return 0;
+       err = mpd->get_block(mpd->inode, next, &new, 1);
+-      if (err)
+-              return;
++      if (err) {
++
++              /* If get block returns with error
++               * we simply return. Later writepage
++               * will redirty the page and writepages
++               * will find the dirty page again
++               */
++              if (err == -EAGAIN)
++                      return 0;
++              /*
++               * get block failure will cause us
++               * to loop in writepages. Because
++               * a_ops->writepage won't be able to
++               * make progress. The page will be redirtied
++               * by writepage and writepages will again
++               * try to write the same.
++               */
++              printk(KERN_EMERG "%s block allocation failed for inode %lu "
++                                "at logical offset %llu with max blocks "
++                                "%zd with error %d\n",
++                                __func__, mpd->inode->i_ino,
++                                (unsigned long long)next,
++                                lbh->b_size >> mpd->inode->i_blkbits, err);
++              printk(KERN_EMERG "This should not happen.!! "
++                                      "Data will be lost\n");
++              /* invlaidate all the pages */
++              ext4_da_block_invalidatepages(mpd, next,
++                              lbh->b_size >> mpd->inode->i_blkbits);
++              return err;
++      }
+       BUG_ON(new.b_size == 0);
+       if (buffer_new(&new))
+@@ -1868,7 +1929,7 @@ static void mpage_da_map_blocks(struct m
+       if (buffer_delay(lbh) || buffer_unwritten(lbh))
+               mpage_put_bnr_to_bhs(mpd, next, &new);
+-      return;
++      return 0;
+ }
+ #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
+@@ -1937,8 +1998,8 @@ flush_it:
+        * We couldn't merge the block to our extent, so we
+        * need to flush current  extent and start new one
+        */
+-      mpage_da_map_blocks(mpd);
+-      mpage_da_submit_io(mpd);
++      if (mpage_da_map_blocks(mpd) == 0)
++              mpage_da_submit_io(mpd);
+       mpd->io_done = 1;
+       return;
+ }
+@@ -1980,8 +2041,8 @@ static int __mpage_da_writepage(struct p
+                * and start IO on them using writepage()
+                */
+               if (mpd->next_page != mpd->first_page) {
+-                      mpage_da_map_blocks(mpd);
+-                      mpage_da_submit_io(mpd);
++                      if (mpage_da_map_blocks(mpd) == 0)
++                              mpage_da_submit_io(mpd);
+                       /*
+                        * skip rest of the page in the page_vec
+                        */
+@@ -2102,8 +2163,8 @@ static int mpage_da_writepages(struct ad
+        * Handle last extent of pages
+        */
+       if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+-              mpage_da_map_blocks(&mpd);
+-              mpage_da_submit_io(&mpd);
++              if (mpage_da_map_blocks(&mpd) == 0)
++                      mpage_da_submit_io(&mpd);
+       }
+       wbc->nr_to_write = to_write - mpd.pages_written;
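
The new ext4_da_block_invalidatepages() helper converts the failed logical block range into the page range covering it. A tiny sketch of just that index arithmetic, assuming 4K pages and 1K blocks purely for illustration:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4096-byte pages (assumed for the example)  */
#define BLKBITS    10   /* 1024-byte blocks (assumed for the example) */

int main(void)
{
        unsigned long long logical = 13;  /* first block of the failed extent */
        long blk_cnt = 9;                 /* blocks in the failed extent      */

        unsigned long long index = logical >> (PAGE_SHIFT - BLKBITS);
        unsigned long long end = (logical + blk_cnt - 1) >>
                                        (PAGE_SHIFT - BLKBITS);

        /* Every page from index to end inclusive gets invalidated. */
        printf("invalidate pages %llu..%llu\n", index, end);
        return 0;
}
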
diff --git a/queue-2.6.27/ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch b/queue-2.6.27/ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch
new file mode 100644 (file)
index 0000000..56313d6
--- /dev/null
@@ -0,0 +1,218 @@
+From tytso@mit.edu  Mon Apr 19 10:20:41 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:25:57 -0400
+Subject: ext4: Make sure all the block allocation paths reserve blocks
+To: stable@kernel.org
+Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-4-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit a30d542a0035b886ffaafd0057ced0a2b28c3a4f upstream.
+
+With delayed allocation we need to make sure blocks are reserved before
+we attempt to allocate them.  Otherwise we get a block allocation
+failure (ENOSPC) during writepages which cannot be handled.  That would
+mean silent data loss (we do a printk stating data will be lost).  This
+patch updates the DIO and fallocate code paths to do block reservation
+before block allocation.  This is needed to make sure parallel DIO and
+fallocate requests don't take blocks out of the delayed reserve space.
+
+When the free block count goes below a threshold we switch to a slow
+path which looks at the percpu counter deltas accumulated on other CPUs.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/balloc.c  |   58 +++++++++++++++++++++++++++++++++++++++---------------
+ fs/ext4/ext4.h    |   13 ++++++++++++
+ fs/ext4/inode.c   |    5 ----
+ fs/ext4/mballoc.c |   23 ++++++++++++---------
+ 4 files changed, 69 insertions(+), 30 deletions(-)
+
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -1754,6 +1754,32 @@ out:
+       return ret;
+ }
++int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
++                                              ext4_fsblk_t nblocks)
++{
++      s64 free_blocks;
++      ext4_fsblk_t root_blocks = 0;
++      struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
++
++      free_blocks = percpu_counter_read(fbc);
++
++      if (!capable(CAP_SYS_RESOURCE) &&
++              sbi->s_resuid != current->fsuid &&
++              (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
++              root_blocks = ext4_r_blocks_count(sbi->s_es);
++
++      if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
++              free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
++
++      if (free_blocks < (root_blocks + nblocks))
++              /* we don't have free space */
++              return -ENOSPC;
++
++      /* reduce fs free blocks counter */
++      percpu_counter_sub(fbc, nblocks);
++      return 0;
++}
++
+ /**
+  * ext4_has_free_blocks()
+  * @sbi:      in-core super block structure.
+@@ -1775,18 +1801,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct
+               sbi->s_resuid != current->fsuid &&
+               (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+               root_blocks = ext4_r_blocks_count(sbi->s_es);
+-#ifdef CONFIG_SMP
+-      if (free_blocks - root_blocks < FBC_BATCH)
+-              free_blocks =
+-                      percpu_counter_sum(&sbi->s_freeblocks_counter);
+-#endif
++
++      if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
++              free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
++
+       if (free_blocks <= root_blocks)
+               /* we don't have free space */
+               return 0;
+       if (free_blocks - root_blocks < nblocks)
+               return free_blocks - root_blocks;
+       return nblocks;
+- }
++}
+ /**
+@@ -1865,14 +1890,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_
+               /*
+                * With delalloc we already reserved the blocks
+                */
+-              *count = ext4_has_free_blocks(sbi, *count);
+-      }
+-      if (*count == 0) {
+-              *errp = -ENOSPC;
+-              return 0;       /*return with ENOSPC error */
++              if (ext4_claim_free_blocks(sbi, *count)) {
++                      *errp = -ENOSPC;
++                      return 0;       /*return with ENOSPC error */
++              }
+       }
+-      num = *count;
+-
+       /*
+        * Check quota for allocation of this block.
+        */
+@@ -2067,9 +2089,13 @@ allocated:
+       le16_add_cpu(&gdp->bg_free_blocks_count, -num);
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
+       spin_unlock(sb_bgl_lock(sbi, group_no));
+-      if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+-              percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+-
++      if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
++              /*
++               * we allocated less blocks than we
++               * claimed. Add the difference back.
++               */
++              percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
++      }
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+               spin_lock(sb_bgl_lock(sbi, flex_group));
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1015,6 +1015,8 @@ extern ext4_fsblk_t ext4_new_blocks(hand
+                                       unsigned long *count, int *errp);
+ extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
+                       ext4_fsblk_t goal, unsigned long *count, int *errp);
++extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
++                                              ext4_fsblk_t nblocks);
+ extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+                                               ext4_fsblk_t nblocks);
+ extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
+@@ -1245,6 +1247,17 @@ do {                                                            \
+               __ext4_std_error((sb), __func__, (errno));      \
+ } while (0)
++#ifdef CONFIG_SMP
++/* Each CPU can accumulate FBC_BATCH blocks in their local
++ * counters. So we need to make sure we have free blocks more
++ * than FBC_BATCH  * nr_cpu_ids. Also add a window of 4 times.
++ */
++#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
++#else
++#define EXT4_FREEBLOCKS_WATERMARK 0
++#endif
++
++
+ /*
+  * Inodes and files operations
+  */
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1564,13 +1564,10 @@ static int ext4_da_reserve_space(struct
+       md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+       total = md_needed + nrblocks;
+-      if (ext4_has_free_blocks(sbi, total) < total) {
++      if (ext4_claim_free_blocks(sbi, total)) {
+               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+               return -ENOSPC;
+       }
+-      /* reduce fs free blocks counter */
+-      percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+-
+       EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+       EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3194,9 +3194,15 @@ ext4_mb_mark_diskspace_used(struct ext4_
+        * at write_begin() time for delayed allocation
+        * do not double accounting
+        */
+-      if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+-              percpu_counter_sub(&sbi->s_freeblocks_counter,
+-                                      ac->ac_b_ex.fe_len);
++      if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
++                      ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
++              /*
++               * we allocated less blocks than we calimed
++               * Add the difference back
++               */
++              percpu_counter_add(&sbi->s_freeblocks_counter,
++                              ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
++      }
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi,
+@@ -4649,14 +4655,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
+               /*
+                * With delalloc we already reserved the blocks
+                */
+-              ar->len = ext4_has_free_blocks(sbi, ar->len);
+-      }
+-
+-      if (ar->len == 0) {
+-              *errp = -ENOSPC;
+-              return 0;
++              if (ext4_claim_free_blocks(sbi, ar->len)) {
++                      *errp = -ENOSPC;
++                      return 0;
++              }
+       }
+-
+       while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
+               ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+               ar->len--;
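
The EXT4_FREEBLOCKS_WATERMARK introduced here exists because each CPU may hold up to FBC_BATCH blocks of unreported delta in its local percpu counter, so a cheap approximate read can be off by as much as FBC_BATCH * nr_cpu_ids; the watermark adds a 4x window on top before falling back to the exact (and expensive) percpu_counter_sum(). A small model with assumed constants; the real FBC_BATCH and CPU count vary by kernel and machine:

#include <stdio.h>

#define FBC_BATCH   32   /* per-CPU batch size (assumed value) */
#define NR_CPU_IDS  16   /* possible CPUs (assumed value)      */
#define WATERMARK   (4 * (FBC_BATCH * NR_CPU_IDS))

/* Decide whether the cheap approximate counter read is trustworthy
 * for an nblocks-sized request, mirroring ext4_claim_free_blocks(). */
static int need_exact_sum(long long approx_free, long long nblocks,
                          long long root_blocks)
{
        return approx_free - (nblocks + root_blocks) < WATERMARK;
}

int main(void)
{
        printf("watermark = %d blocks\n", WATERMARK);
        printf("exact sum needed for a 100-block request at 2100 free: %d\n",
               need_exact_sum(2100, 100, 0));
        return 0;
}
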
diff --git a/queue-2.6.27/ext4-retry-block-allocation-if-we-have-free-blocks-left.patch b/queue-2.6.27/ext4-retry-block-allocation-if-we-have-free-blocks-left.patch
new file mode 100644 (file)
index 0000000..76a9962
--- /dev/null
@@ -0,0 +1,200 @@
+From tytso@mit.edu  Mon Apr 19 10:22:08 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:26:00 -0400
+Subject: ext4: Retry block allocation if we have free blocks left
+To: stable@kernel.org
+Cc: "Theodore Ts'o" <tytso@mit.edu>, Ext4 Developers List <linux-ext4@vger.kernel.org>, Mingming Cao <cmm@us.ibm.com>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-7-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit df22291ff0fde0d350cf15dac3e5cc33ac528875 upstream.
+
+When we truncate files, the metadata blocks released are not reused
+until we commit the truncate transaction.  That means a delayed
+get_block request can return ENOSPC even if we have free blocks left.
+Force a journal commit and retry block allocation if we get ENOSPC
+with free blocks left.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c |   81 +++++++++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 57 insertions(+), 24 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1661,6 +1661,7 @@ struct mpage_da_data {
+       struct writeback_control *wbc;
+       int io_done;
+       long pages_written;
++      int retval;
+ };
+ /*
+@@ -1858,6 +1859,24 @@ static void ext4_da_block_invalidatepage
+       return;
+ }
++static void ext4_print_free_blocks(struct inode *inode)
++{
++      struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
++      printk(KERN_EMERG "Total free blocks count %lld\n",
++                      ext4_count_free_blocks(inode->i_sb));
++      printk(KERN_EMERG "Free/Dirty block details\n");
++      printk(KERN_EMERG "free_blocks=%lld\n",
++                      percpu_counter_sum(&sbi->s_freeblocks_counter));
++      printk(KERN_EMERG "dirty_blocks=%lld\n",
++                      percpu_counter_sum(&sbi->s_dirtyblocks_counter));
++      printk(KERN_EMERG "Block reservation details\n");
++      printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
++                      EXT4_I(inode)->i_reserved_data_blocks);
++      printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
++                      EXT4_I(inode)->i_reserved_meta_blocks);
++      return;
++}
++
+ /*
+  * mpage_da_map_blocks - go through given space
+  *
+@@ -1872,7 +1891,7 @@ static int  mpage_da_map_blocks(struct m
+       int err = 0;
+       struct buffer_head new;
+       struct buffer_head *lbh = &mpd->lbh;
+-      sector_t next = lbh->b_blocknr;
++      sector_t next;
+       /*
+        * We consider only non-mapped and non-allocated blocks
+@@ -1882,6 +1901,7 @@ static int  mpage_da_map_blocks(struct m
+       new.b_state = lbh->b_state;
+       new.b_blocknr = 0;
+       new.b_size = lbh->b_size;
++      next = lbh->b_blocknr;
+       /*
+        * If we didn't accumulate anything
+        * to write simply return
+@@ -1898,6 +1918,13 @@ static int  mpage_da_map_blocks(struct m
+                */
+               if (err == -EAGAIN)
+                       return 0;
++
++              if (err == -ENOSPC &&
++                              ext4_count_free_blocks(mpd->inode->i_sb)) {
++                      mpd->retval = err;
++                      return 0;
++              }
++
+               /*
+                * get block failure will cause us
+                * to loop in writepages. Because
+@@ -1915,8 +1942,7 @@ static int  mpage_da_map_blocks(struct m
+               printk(KERN_EMERG "This should not happen.!! "
+                                       "Data will be lost\n");
+               if (err == -ENOSPC) {
+-                      printk(KERN_CRIT "Total free blocks count %lld\n",
+-                              ext4_count_free_blocks(mpd->inode->i_sb));
++                      ext4_print_free_blocks(mpd->inode);
+               }
+               /* invlaidate all the pages */
+               ext4_da_block_invalidatepages(mpd, next,
+@@ -2141,39 +2167,36 @@ static int __mpage_da_writepage(struct p
+  */
+ static int mpage_da_writepages(struct address_space *mapping,
+                              struct writeback_control *wbc,
+-                             get_block_t get_block)
++                             struct mpage_da_data *mpd)
+ {
+-      struct mpage_da_data mpd;
+       long to_write;
+       int ret;
+-      if (!get_block)
++      if (!mpd->get_block)
+               return generic_writepages(mapping, wbc);
+-      mpd.wbc = wbc;
+-      mpd.inode = mapping->host;
+-      mpd.lbh.b_size = 0;
+-      mpd.lbh.b_state = 0;
+-      mpd.lbh.b_blocknr = 0;
+-      mpd.first_page = 0;
+-      mpd.next_page = 0;
+-      mpd.get_block = get_block;
+-      mpd.io_done = 0;
+-      mpd.pages_written = 0;
++      mpd->lbh.b_size = 0;
++      mpd->lbh.b_state = 0;
++      mpd->lbh.b_blocknr = 0;
++      mpd->first_page = 0;
++      mpd->next_page = 0;
++      mpd->io_done = 0;
++      mpd->pages_written = 0;
++      mpd->retval = 0;
+       to_write = wbc->nr_to_write;
+-      ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
++      ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
+       /*
+        * Handle last extent of pages
+        */
+-      if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+-              if (mpage_da_map_blocks(&mpd) == 0)
+-                      mpage_da_submit_io(&mpd);
++      if (!mpd->io_done && mpd->next_page != mpd->first_page) {
++              if (mpage_da_map_blocks(mpd) == 0)
++                      mpage_da_submit_io(mpd);
+       }
+-      wbc->nr_to_write = to_write - mpd.pages_written;
++      wbc->nr_to_write = to_write - mpd->pages_written;
+       return ret;
+ }
+@@ -2420,6 +2443,7 @@ static int ext4_da_writepages(struct add
+ {
+       handle_t *handle = NULL;
+       loff_t range_start = 0;
++      struct mpage_da_data mpd;
+       struct inode *inode = mapping->host;
+       int needed_blocks, ret = 0, nr_to_writebump = 0;
+       long to_write, pages_skipped = 0;
+@@ -2467,6 +2491,9 @@ static int ext4_da_writepages(struct add
+       range_start =  wbc->range_start;
+       pages_skipped = wbc->pages_skipped;
++      mpd.wbc = wbc;
++      mpd.inode = mapping->host;
++
+ restart_loop:
+       to_write = wbc->nr_to_write;
+       while (!ret && to_write > 0) {
+@@ -2502,11 +2529,17 @@ restart_loop:
+                               goto out_writepages;
+                       }
+               }
+-
+               to_write -= wbc->nr_to_write;
+-              ret = mpage_da_writepages(mapping, wbc,
+-                                        ext4_da_get_block_write);
++
++              mpd.get_block = ext4_da_get_block_write;
++              ret = mpage_da_writepages(mapping, wbc, &mpd);
++
+               ext4_journal_stop(handle);
++
++              if (mpd.retval == -ENOSPC)
++                      jbd2_journal_force_commit_nested(sbi->s_journal);
++
++              /* reset the retry count */
+               if (ret == MPAGE_DA_EXTENT_TAIL) {
+                       /*
+                        * got one extent now try with
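
Modeled on its own, the retry this patch adds is: on ENOSPC while the filesystem still reports free blocks, force a journal commit so that blocks pinned by an uncommitted truncate become reusable, then try the extent again. force_commit() and map_blocks() below are stand-ins, not kernel functions.

#include <stdio.h>
#include <errno.h>

static long long fs_free_blocks = 128;  /* blocks freed by a truncate */
static int committed;                   /* has the commit run yet?    */

static void force_commit(void)
{
        committed = 1;  /* models jbd2_journal_force_commit_nested() */
}

static int map_blocks(void)
{
        /* Until the truncate transaction commits, the freed blocks
         * cannot be reused and allocation reports ENOSPC. */
        return committed ? 0 : -ENOSPC;
}

int main(void)
{
        int err = map_blocks();

        if (err == -ENOSPC && fs_free_blocks > 0) {
                force_commit();      /* release the truncated blocks */
                err = map_blocks();  /* retry the same extent        */
        }
        printf("allocation %s\n", err ? "failed" : "succeeded");
        return 0;
}
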
diff --git a/queue-2.6.27/ext4-retry-block-reservation.patch b/queue-2.6.27/ext4-retry-block-reservation.patch
new file mode 100644 (file)
index 0000000..d4e3cde
--- /dev/null
@@ -0,0 +1,131 @@
+From tytso@mit.edu  Mon Apr 19 10:21:18 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:25:59 -0400
+Subject: ext4: Retry block reservation
+To: stable@kernel.org
+Cc: "Theodore Ts'o" <tytso@mit.edu>, Ext4 Developers List <linux-ext4@vger.kernel.org>, Mingming Cao <cmm@us.ibm.com>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-6-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 030ba6bc67b4f2bc5cd174f57785a1745c929abe upstream.
+
+During block reservation, if we don't have enough blocks left, retry
+block reservation with smaller block counts.  This makes sure we try
+fallocate and DIO with smaller request sizes and don't fail early.  The
+delayed allocation reservation cannot retry with a smaller block count,
+so instead retry the same block reservation to handle temporary
+disk-full conditions.  Also print free block details if we fail block
+allocation during writepages.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/balloc.c  |    8 +++++++-
+ fs/ext4/inode.c   |   14 +++++++++++---
+ fs/ext4/mballoc.c |    7 ++++++-
+ 3 files changed, 24 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -1907,10 +1907,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_
+               /*
+                * With delalloc we already reserved the blocks
+                */
+-              if (ext4_claim_free_blocks(sbi, *count)) {
++              while (*count && ext4_claim_free_blocks(sbi, *count)) {
++                      /* let others to free the space */
++                      yield();
++                      *count = *count >> 1;
++              }
++              if (!*count) {
+                       *errp = -ENOSPC;
+                       return 0;       /*return with ENOSPC error */
+               }
++              num = *count;
+       }
+       /*
+        * Check quota for allocation of this block.
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1549,6 +1549,7 @@ static int ext4_journalled_write_end(str
+ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+ {
++      int retries = 0;
+        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+        unsigned long md_needed, mdblocks, total = 0;
+@@ -1557,6 +1558,7 @@ static int ext4_da_reserve_space(struct
+        * in order to allocate nrblocks
+        * worse case is one extent per block
+        */
++repeat:
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+       mdblocks = ext4_calc_metadata_amount(inode, total);
+@@ -1567,6 +1569,10 @@ static int ext4_da_reserve_space(struct
+       if (ext4_claim_free_blocks(sbi, total)) {
+               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++              if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
++                      yield();
++                      goto repeat;
++              }
+               return -ENOSPC;
+       }
+       EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+@@ -1864,20 +1870,18 @@ static void ext4_da_block_invalidatepage
+ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
+ {
+       int err = 0;
++      struct buffer_head new;
+       struct buffer_head *lbh = &mpd->lbh;
+       sector_t next = lbh->b_blocknr;
+-      struct buffer_head new;
+       /*
+        * We consider only non-mapped and non-allocated blocks
+        */
+       if (buffer_mapped(lbh) && !buffer_delay(lbh))
+               return 0;
+-
+       new.b_state = lbh->b_state;
+       new.b_blocknr = 0;
+       new.b_size = lbh->b_size;
+-
+       /*
+        * If we didn't accumulate anything
+        * to write simply return
+@@ -1910,6 +1914,10 @@ static int  mpage_da_map_blocks(struct m
+                                 lbh->b_size >> mpd->inode->i_blkbits, err);
+               printk(KERN_EMERG "This should not happen.!! "
+                                       "Data will be lost\n");
++              if (err == -ENOSPC) {
++                      printk(KERN_CRIT "Total free blocks count %lld\n",
++                              ext4_count_free_blocks(mpd->inode->i_sb));
++              }
+               /* invlaidate all the pages */
+               ext4_da_block_invalidatepages(mpd, next,
+                               lbh->b_size >> mpd->inode->i_blkbits);
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4651,7 +4651,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
+               /*
+                * With delalloc we already reserved the blocks
+                */
+-              if (ext4_claim_free_blocks(sbi, ar->len)) {
++              while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
++                      /* let others to free the space */
++                      yield();
++                      ar->len = ar->len >> 1;
++              }
++              if (!ar->len) {
+                       *errp = -ENOSPC;
+                       return 0;
+               }
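+
+Several of the hunks above share one retry pattern: yield to let other
+tasks free some space, halve the request, and try again until either
+the claim succeeds or the request reaches zero, which is reported as
+ENOSPC.  A condensed sketch of that pattern (claim_with_backoff is an
+illustrative name, not a function this patch adds):
+
+	static int claim_with_backoff(struct ext4_sb_info *sbi,
+				      unsigned long *count)
+	{
+		while (*count && ext4_claim_free_blocks(sbi, *count)) {
+			yield();	/* let others free some space */
+			*count >>= 1;	/* retry with a smaller request */
+		}
+		return *count ? 0 : -ENOSPC;
+	}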
diff --git a/queue-2.6.27/ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch b/queue-2.6.27/ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch
new file mode 100644 (file)
index 0000000..b908a0b
--- /dev/null
@@ -0,0 +1,99 @@
+From tytso@mit.edu  Mon Apr 19 10:22:28 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:26:01 -0400
+Subject: ext4: Use tag dirty lookup during mpage_da_submit_io
+To: stable@kernel.org
+Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-8-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit af6f029d3836eb7264cd3fbb13a6baf0e5fdb5ea upstream.
+
+This enables us to drop the use of the range_cont writeback mode
+from ext4_da_writepages.
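+
+In mpage_da_submit_io() the plain pagevec_lookup() becomes a tagged
+lookup, so only pages still tagged dirty in the radix tree are
+visited.  A condensed sketch of the resulting loop, mirroring the hunk
+below (write_page() and the start/last bounds are illustrative
+stand-ins, not names from this patch):
+
+	struct pagevec pvec;
+	pgoff_t index = start, end = last;	/* range being written back */
+	int i, nr_pages;
+
+	pagevec_init(&pvec, 0);
+	while (index <= end) {
+		/* the tagged lookup advances index past returned pages */
+		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+				PAGECACHE_TAG_DIRTY,
+				min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1);
+		if (nr_pages == 0)
+			break;
+		for (i = 0; i < nr_pages; i++)
+			write_page(pvec.pages[i]);	/* hypothetical helper */
+		pagevec_release(&pvec);
+	}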
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/ext4/inode.c |   30 +++++++++++++-----------------
+ 1 file changed, 13 insertions(+), 17 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1699,17 +1699,23 @@ static int mpage_da_submit_io(struct mpa
+       pagevec_init(&pvec, 0);
+       while (index <= end) {
+-              nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
++              /*
++               * We can use a PAGECACHE_TAG_DIRTY lookup here because,
++               * even though we have cleared the dirty flag on the page,
++               * the page is still kept in the radix tree with the
++               * PAGECACHE_TAG_DIRTY tag.  See clear_page_dirty_for_io.
++               * The PAGECACHE_TAG_DIRTY tag is cleared in
++               * set_page_writeback, which is called via the writepage
++               * callback below.
++               */
++              nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
++                                      PAGECACHE_TAG_DIRTY,
++                                      min(end - index,
++                                      (pgoff_t)PAGEVEC_SIZE-1) + 1);
+               if (nr_pages == 0)
+                       break;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+-                      index = page->index;
+-                      if (index > end)
+-                              break;
+-                      index++;
+-
+                       BUG_ON(!PageLocked(page));
+                       BUG_ON(PageWriteback(page));
+@@ -2442,7 +2448,6 @@ static int ext4_da_writepages(struct add
+                             struct writeback_control *wbc)
+ {
+       handle_t *handle = NULL;
+-      loff_t range_start = 0;
+       struct mpage_da_data mpd;
+       struct inode *inode = mapping->host;
+       int needed_blocks, ret = 0, nr_to_writebump = 0;
+@@ -2481,14 +2486,7 @@ static int ext4_da_writepages(struct add
+               wbc->nr_to_write = sbi->s_mb_stream_request;
+       }
+-      if (!wbc->range_cyclic)
+-              /*
+-               * If range_cyclic is not set force range_cont
+-               * and save the old writeback_index
+-               */
+-              wbc->range_cont = 1;
+-      range_start =  wbc->range_start;
+       pages_skipped = wbc->pages_skipped;
+       mpd.wbc = wbc;
+@@ -2559,9 +2557,8 @@ restart_loop:
+               wbc->nr_to_write = to_write;
+       }
+-      if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
++      if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) {
+               /* We skipped pages in this loop */
+-              wbc->range_start = range_start;
+               wbc->nr_to_write = to_write +
+                               wbc->pages_skipped - pages_skipped;
+               wbc->pages_skipped = pages_skipped;
+@@ -2570,7 +2567,6 @@ restart_loop:
+ out_writepages:
+       wbc->nr_to_write = to_write - nr_to_writebump;
+-      wbc->range_start = range_start;
+       return ret;
+ }
diff --git a/queue-2.6.27/percpu-counter-clean-up-percpu_counter_sum_and_set.patch b/queue-2.6.27/percpu-counter-clean-up-percpu_counter_sum_and_set.patch
new file mode 100644 (file)
index 0000000..86d95f9
--- /dev/null
@@ -0,0 +1,104 @@
+From tytso@mit.edu  Mon Apr 19 10:20:04 2010
+From: Mingming Cao <cmm@us.ibm.com>
+Date: Mon, 15 Mar 2010 20:25:56 -0400
+Subject: percpu counter: clean up percpu_counter_sum_and_set()
+To: stable@kernel.org
+Cc: "Theodore Ts'o" <tytso@mit.edu>, Andrew Morton <akpm@linux-foundation.org>, Ext4 Developers List <linux-ext4@vger.kernel.org>, Mingming Cao <cmm@us.ibm.com>, "Jayson R. King" <dev@jaysonking.com>
+Message-ID: <1268699165-17461-3-git-send-email-tytso@mit.edu>
+
+
+From: Mingming Cao <cmm@us.ibm.com>
+
+commit 1f7c14c62ce63805f9574664a6c6de3633d4a354 upstream.
+
+percpu_counter_sum_and_set() and percpu_counter_sum() are the same except
+that the former updates the global counter after accounting.  Since we
+already take fbc->lock to calculate the precise value of the counter in
+percpu_counter_sum() anyway, it should simply set fbc->count too, as
+percpu_counter_sum_and_set() does.
+
+This patch merges these two interfaces into one.
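+
+A minimal usage sketch of the merged interface, assuming the ext4
+superblock counter used elsewhere in this series: percpu_counter_read()
+stays the cheap approximate read, while percpu_counter_sum() takes
+fbc->lock, folds the per-cpu deltas into fbc->count, and returns the
+precise total.
+
+	/* approximate, lock-free; may be off by roughly batch * nr_cpus */
+	s64 approx = percpu_counter_read(&sbi->s_freeblocks_counter);
+	/* precise; also refreshes fbc->count as a side effect */
+	s64 precise = percpu_counter_sum(&sbi->s_freeblocks_counter);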
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: <linux-ext4@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+
+---
+ fs/ext4/balloc.c               |    2 +-
+ include/linux/percpu_counter.h |   12 +++---------
+ lib/percpu_counter.c           |    8 +++-----
+ 3 files changed, 7 insertions(+), 15 deletions(-)
+
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -1778,7 +1778,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct
+ #ifdef CONFIG_SMP
+       if (free_blocks - root_blocks < FBC_BATCH)
+               free_blocks =
+-                      percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
++                      percpu_counter_sum(&sbi->s_freeblocks_counter);
+ #endif
+       if (free_blocks <= root_blocks)
+               /* we don't have free space */
+--- a/include/linux/percpu_counter.h
++++ b/include/linux/percpu_counter.h
+@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percp
+ void percpu_counter_destroy(struct percpu_counter *fbc);
+ void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
+ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
+-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
++s64 __percpu_counter_sum(struct percpu_counter *fbc);
+ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
+ {
+@@ -44,19 +44,13 @@ static inline void percpu_counter_add(st
+ static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
+ {
+-      s64 ret = __percpu_counter_sum(fbc, 0);
++      s64 ret = __percpu_counter_sum(fbc);
+       return ret < 0 ? 0 : ret;
+ }
+-static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+-{
+-      return __percpu_counter_sum(fbc, 1);
+-}
+-
+-
+ static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
+ {
+-      return __percpu_counter_sum(fbc, 0);
++      return __percpu_counter_sum(fbc);
+ }
+ static inline s64 percpu_counter_read(struct percpu_counter *fbc)
+--- a/lib/percpu_counter.c
++++ b/lib/percpu_counter.c
+@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
+  * Add up all the per-cpu counts, return the result.  This is a more accurate
+  * but much slower version of percpu_counter_read_positive()
+  */
+-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
++s64 __percpu_counter_sum(struct percpu_counter *fbc)
+ {
+       s64 ret;
+       int cpu;
+@@ -62,11 +62,9 @@ s64 __percpu_counter_sum(struct percpu_c
+       for_each_online_cpu(cpu) {
+               s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
+               ret += *pcount;
+-              if (set)
+-                      *pcount = 0;
++              *pcount = 0;
+       }
+-      if (set)
+-              fbc->count = ret;
++      fbc->count = ret;
+       spin_unlock(&fbc->lock);
+       return ret;
index 9091fb02c6d836ddecf1e9fc7c08eff6a0163443..3041f8329e3608a94a193aeaff1615a1dc73817b 100644 (file)
@@ -1 +1,12 @@
 alsa-mixart-range-checking-proc-file.patch
+ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch
+percpu-counter-clean-up-percpu_counter_sum_and_set.patch
+ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch
+ext4-add-percpu-dirty-block-accounting.patch
+ext4-retry-block-reservation.patch
+ext4-retry-block-allocation-if-we-have-free-blocks-left.patch
+ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch
+vfs-remove-the-range_cont-writeback-mode.patch
+vfs-add-no_nrwrite_index_update-writeback-control-flag.patch
+ext4-fix-file-fragmentation-during-large-file-write.patch
+ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch
diff --git a/queue-2.6.27/vfs-add-no_nrwrite_index_update-writeback-control-flag.patch b/queue-2.6.27/vfs-add-no_nrwrite_index_update-writeback-control-flag.patch
new file mode 100644 (file)
index 0000000..a6b5286
--- /dev/null
@@ -0,0 +1,87 @@
+From tytso@mit.edu  Mon Apr 19 10:23:14 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:26:03 -0400
+Subject: vfs: Add no_nrwrite_index_update writeback control flag
+To: stable@kernel.org
+Cc: linux-fsdevel@vger.kernel.org, Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-10-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 upstream.
+
+If no_nrwrite_index_update is set, write_cache_pages() does not update
+wbc->nr_to_write or the address space's writeback_index.  This change
+enables a file system to skip these updates in write_cache_pages() and
+do them in its writepages() callback instead.  This patch will be
+followed by an ext4 patch that makes use of the new flag.
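+
+A minimal sketch of how a file system's writepages() callback might use
+the flag (myfs_writepages, myfs_writepage and the pages_written counter
+are hypothetical; the real ext4 user follows later in this series):
+
+	static int myfs_writepages(struct address_space *mapping,
+				   struct writeback_control *wbc)
+	{
+		long nr_to_write = wbc->nr_to_write;
+		long pages_written = 0;
+		int ret;
+
+		/*
+		 * Keep write_cache_pages() from touching nr_to_write
+		 * and mapping->writeback_index, then fold our own
+		 * accounting back into wbc in one place.
+		 */
+		wbc->no_nrwrite_index_update = 1;
+		ret = write_cache_pages(mapping, wbc, myfs_writepage,
+					&pages_written);
+		wbc->nr_to_write = nr_to_write - pages_written;
+		return ret;
+	}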
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+CC: linux-fsdevel@vger.kernel.org
+[dev@jaysonking.com: Modified the patch to account for subsequent changes in mainline being cherry-picked earlier for 2.6.27.y.]
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/writeback.h |    9 +++++++++
+ mm/page-writeback.c       |   14 +++++++++-----
+ 2 files changed, 18 insertions(+), 5 deletions(-)
+
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -62,6 +62,15 @@ struct writeback_control {
+       unsigned for_writepages:1;      /* This is a writepages() call */
+       unsigned range_cyclic:1;        /* range_start is cyclic */
+       unsigned more_io:1;             /* more io to be dispatched */
++      /*
++       * write_cache_pages() won't update wbc->nr_to_write and
++       * mapping->writeback_index if no_nrwrite_index_update
++       * is set.  write_cache_pages() may write more than we
++       * requested, and we want to make sure nr_to_write and
++       * writeback_index are updated in a consistent manner,
++       * so we use a single flag to control both.
++       */
++      unsigned no_nrwrite_index_update:1;
+ };
+ /*
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -878,6 +878,7 @@ int write_cache_pages(struct address_spa
+       pgoff_t done_index;
+       int cycled;
+       int range_whole = 0;
++      long nr_to_write = wbc->nr_to_write;
+       if (wbc->nonblocking && bdi_write_congested(bdi)) {
+               wbc->encountered_congestion = 1;
+@@ -985,9 +986,9 @@ continue_unlock:
+                               }
+                       }
+-                      if (wbc->nr_to_write > 0) {
+-                              wbc->nr_to_write--;
+-                              if (wbc->nr_to_write == 0 &&
++                      if (nr_to_write > 0) {
++                              nr_to_write--;
++                              if (nr_to_write == 0 &&
+                                   wbc->sync_mode == WB_SYNC_NONE) {
+                                       /*
+                                        * We stop writing back only if we are
+@@ -1024,8 +1025,11 @@ continue_unlock:
+               end = writeback_index - 1;
+               goto retry;
+       }
+-      if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+-              mapping->writeback_index = done_index;
++      if (!wbc->no_nrwrite_index_update) {
++              if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
++                      mapping->writeback_index = done_index;
++              wbc->nr_to_write = nr_to_write;
++      }
+       return ret;
+ }
diff --git a/queue-2.6.27/vfs-remove-the-range_cont-writeback-mode.patch b/queue-2.6.27/vfs-remove-the-range_cont-writeback-mode.patch
new file mode 100644 (file)
index 0000000..5e5a70c
--- /dev/null
@@ -0,0 +1,50 @@
+From tytso@mit.edu  Mon Apr 19 10:22:47 2010
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 15 Mar 2010 20:26:02 -0400
+Subject: vfs: Remove the range_cont writeback mode.
+To: stable@kernel.org
+Cc: linux-fsdevel@vger.kernel.org, Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Message-ID: <1268699165-17461-9-git-send-email-tytso@mit.edu>
+
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+
+commit 74baaaaec8b4f22e1ae279f5ecca4ff705b28912 upstream.
+
+Ext4 was the only user of the range_cont writeback mode, and it has
+switched to a different method, so remove the range_cont mode, which is
+no longer used anywhere in the kernel.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+CC: linux-fsdevel@vger.kernel.org
+Signed-off-by: Jayson R. King <dev@jaysonking.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/writeback.h |    1 -
+ mm/page-writeback.c       |    2 --
+ 2 files changed, 3 deletions(-)
+
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -62,7 +62,6 @@ struct writeback_control {
+       unsigned for_writepages:1;      /* This is a writepages() call */
+       unsigned range_cyclic:1;        /* range_start is cyclic */
+       unsigned more_io:1;             /* more io to be dispatched */
+-      unsigned range_cont:1;
+ };
+ /*
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -1027,8 +1027,6 @@ continue_unlock:
+       if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+               mapping->writeback_index = done_index;
+-      if (wbc->range_cont)
+-              wbc->range_start = index << PAGE_CACHE_SHIFT;
+       return ret;
+ }
+ EXPORT_SYMBOL(write_cache_pages);