From: Greg Kroah-Hartman Date: Mon, 19 Apr 2010 17:26:16 +0000 (-0700) Subject: .27 ext4 patches X-Git-Tag: v2.6.32.12~35 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d7b4be12a43a5b3c85891ac50ae3d16114a74550;p=thirdparty%2Fkernel%2Fstable-queue.git .27 ext4 patches --- diff --git a/queue-2.6.27/ext4-add-percpu-dirty-block-accounting.patch b/queue-2.6.27/ext4-add-percpu-dirty-block-accounting.patch new file mode 100644 index 00000000000..a86a935b047 --- /dev/null +++ b/queue-2.6.27/ext4-add-percpu-dirty-block-accounting.patch @@ -0,0 +1,312 @@ +From tytso@mit.edu Mon Apr 19 10:21:01 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:25:58 -0400 +Subject: ext4: Add percpu dirty block accounting. +To: stable@kernel.org +Cc: "Theodore Ts'o" , Ext4 Developers List , Mingming Cao , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-5-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit 6bc6e63fcd7dac9e633ea29f1fddd9580ab28f3f upstream. + +This patch adds dirty block accounting using percpu_counters. Delayed +allocation block reservation is now done by updating dirty block +counter. In a later patch we switch to non delalloc mode if the +filesystem free blocks is greater than 150% of total filesystem dirty +blocks + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/balloc.c | 62 ++++++++++++++++++++++++++++++++++-------------------- + fs/ext4/ext4_sb.h | 1 + fs/ext4/inode.c | 22 +++++++++---------- + fs/ext4/mballoc.c | 31 ++++++++++++--------------- + fs/ext4/super.c | 8 ++++++ + 5 files changed, 73 insertions(+), 51 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -1757,26 +1757,38 @@ out: + int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks) + { +- s64 free_blocks; ++ s64 free_blocks, dirty_blocks; + ext4_fsblk_t root_blocks = 0; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; ++ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + +- free_blocks = percpu_counter_read(fbc); ++ free_blocks = percpu_counter_read_positive(fbc); ++ dirty_blocks = percpu_counter_read_positive(dbc); + + if (!capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); + +- if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK) +- free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter); +- +- if (free_blocks < (root_blocks + nblocks)) ++ if (free_blocks - (nblocks + root_blocks + dirty_blocks) < ++ EXT4_FREEBLOCKS_WATERMARK) { ++ free_blocks = percpu_counter_sum(fbc); ++ dirty_blocks = percpu_counter_sum(dbc); ++ if (dirty_blocks < 0) { ++ printk(KERN_CRIT "Dirty block accounting " ++ "went wrong %lld\n", ++ dirty_blocks); ++ } ++ } ++ /* Check whether we have space after ++ * accounting for current dirty blocks ++ */ ++ if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks)) + /* we don't have free space */ + return -ENOSPC; + +- /* reduce fs free blocks counter */ +- percpu_counter_sub(fbc, nblocks); ++ /* Add the blocks to nblocks */ ++ percpu_counter_add(dbc, nblocks); + return 0; + } + +@@ -1792,23 +1804,28 @@ int ext4_claim_free_blocks(struct ext4_s + ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks) + { +- ext4_fsblk_t free_blocks; ++ ext4_fsblk_t free_blocks, dirty_blocks; + 
ext4_fsblk_t root_blocks = 0; ++ struct percpu_counter *fbc = &sbi->s_freeblocks_counter; ++ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; + +- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); ++ free_blocks = percpu_counter_read_positive(fbc); ++ dirty_blocks = percpu_counter_read_positive(dbc); + + if (!capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); + +- if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK) +- free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); +- +- if (free_blocks <= root_blocks) ++ if (free_blocks - (nblocks + root_blocks + dirty_blocks) < ++ EXT4_FREEBLOCKS_WATERMARK) { ++ free_blocks = percpu_counter_sum_positive(fbc); ++ dirty_blocks = percpu_counter_sum_positive(dbc); ++ } ++ if (free_blocks <= (root_blocks + dirty_blocks)) + /* we don't have free space */ + return 0; +- if (free_blocks - root_blocks < nblocks) ++ if (free_blocks - (root_blocks + dirty_blocks) < nblocks) + return free_blocks - root_blocks; + return nblocks; + } +@@ -2089,13 +2106,14 @@ allocated: + le16_add_cpu(&gdp->bg_free_blocks_count, -num); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); + spin_unlock(sb_bgl_lock(sbi, group_no)); +- if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) { +- /* +- * we allocated less blocks than we +- * claimed. Add the difference back. +- */ +- percpu_counter_add(&sbi->s_freeblocks_counter, *count - num); +- } ++ percpu_counter_sub(&sbi->s_freeblocks_counter, num); ++ /* ++ * Now reduce the dirty block count also. Should not go negative ++ */ ++ if (!EXT4_I(inode)->i_delalloc_reserved_flag) ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, *count); ++ else ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, num); + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group_no); + spin_lock(sb_bgl_lock(sbi, flex_group)); +--- a/fs/ext4/ext4_sb.h ++++ b/fs/ext4/ext4_sb.h +@@ -60,6 +60,7 @@ struct ext4_sb_info { + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; ++ struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock s_blockgroup_lock; + + /* root of the per fs reservation window tree */ +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1032,19 +1032,20 @@ static void ext4_da_update_reserve_space + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + +- /* Account for allocated meta_blocks */ +- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; +- +- /* update fs free blocks counter for truncate case */ +- percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); ++ if (mdb_free) { ++ /* Account for allocated meta_blocks */ ++ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; ++ ++ /* update fs dirty blocks counter */ ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); ++ EXT4_I(inode)->i_allocated_meta_blocks = 0; ++ EXT4_I(inode)->i_reserved_meta_blocks = mdb; ++ } + + /* update per-inode reservations */ + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= used; + +- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); +- EXT4_I(inode)->i_reserved_meta_blocks = mdb; +- EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + /* +@@ -1609,8 +1610,8 @@ 
static void ext4_da_release_space(struct + + release = to_free + mdb_free; + +- /* update fs free blocks counter for truncate case */ +- percpu_counter_add(&sbi->s_freeblocks_counter, release); ++ /* update fs dirty blocks counter for truncate case */ ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + + /* update per-inode reservations */ + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); +@@ -2546,7 +2547,6 @@ static int ext4_da_write_begin(struct fi + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; +- + retry: + /* + * With delayed allocation, we don't log the i_disksize update +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3100,7 +3100,7 @@ void exit_ext4_mballoc(void) + */ + static noinline_for_stack int + ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, +- handle_t *handle) ++ handle_t *handle, unsigned long reserv_blks) + { + struct buffer_head *bitmap_bh = NULL; + struct ext4_super_block *es; +@@ -3188,21 +3188,16 @@ ext4_mb_mark_diskspace_used(struct ext4_ + le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); + gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); + spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); +- ++ percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); + /* +- * free blocks account has already be reduced/reserved +- * at write_begin() time for delayed allocation +- * do not double accounting ++ * Now reduce the dirty block count also. Should not go negative + */ +- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) && +- ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) { +- /* +- * we allocated less blocks than we calimed +- * Add the difference back +- */ +- percpu_counter_add(&sbi->s_freeblocks_counter, +- ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len); +- } ++ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) ++ /* release all the reserved blocks if non delalloc */ ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); ++ else ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, ++ ac->ac_b_ex.fe_len); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, +@@ -4636,12 +4631,13 @@ static int ext4_mb_discard_preallocation + ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) + { ++ int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; + ext4_fsblk_t block = 0; +- int freed; +- int inquota; ++ unsigned long inquota; ++ unsigned long reserv_blks = 0; + + sb = ar->inode->i_sb; + sbi = EXT4_SB(sb); +@@ -4659,6 +4655,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t + *errp = -ENOSPC; + return 0; + } ++ reserv_blks = ar->len; + } + while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { + ar->flags |= EXT4_MB_HINT_NOPREALLOC; +@@ -4704,7 +4701,7 @@ repeat: + ext4_mb_new_preallocation(ac); + } + if (likely(ac->ac_status == AC_STATUS_FOUND)) { +- *errp = ext4_mb_mark_diskspace_used(ac, handle); ++ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); + if (*errp == -EAGAIN) { + /* + * drop the reference that we took +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -521,6 +521,7 @@ static void ext4_put_super(struct super_ + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); ++ percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + brelse(sbi->s_sbh); + #ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; 
i++) +@@ -2280,6 +2281,9 @@ static int ext4_fill_super(struct super_ + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sb)); + } ++ if (!err) { ++ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); ++ } + if (err) { + printk(KERN_ERR "EXT4-fs: insufficient memory\n"); + goto failed_mount3; +@@ -2517,6 +2521,7 @@ failed_mount3: + percpu_counter_destroy(&sbi->s_freeblocks_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); ++ percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); +@@ -3208,7 +3213,8 @@ static int ext4_statfs(struct dentry *de + buf->f_type = EXT4_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; +- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); ++ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - ++ percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); + ext4_free_blocks_count_set(es, buf->f_bfree); + buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); + if (buf->f_bfree < ext4_r_blocks_count(es)) diff --git a/queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch b/queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch new file mode 100644 index 00000000000..00bd9778a92 --- /dev/null +++ b/queue-2.6.27/ext4-fix-file-fragmentation-during-large-file-write.patch @@ -0,0 +1,199 @@ +From tytso@mit.edu Mon Apr 19 10:23:42 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:26:04 -0400 +Subject: ext4: Fix file fragmentation during large file write. +To: stable@kernel.org +Cc: Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-11-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit 22208dedbd7626e5fc4339c417f8d24cc21f79d7 upstream. + +The range_cyclic writeback mode uses the address_space writeback_index +as the start index for writeback. With delayed allocation we were +updating writeback_index wrongly resulting in highly fragmented file. +This patch reduces the number of extents reduced from 4000 to 27 for a +3GB file. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Theodore Ts'o +[dev@jaysonking.com: Some changed lines from the original version of this patch were dropped, since they were rolled up with another cherry-picked patch applied to 2.6.27.y earlier.] +Signed-off-by: Jayson R. 
King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 88 +++++++++++++++++++++++++++++++++++--------------------- + 1 file changed, 55 insertions(+), 33 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1721,7 +1721,11 @@ static int mpage_da_submit_io(struct mpa + + pages_skipped = mpd->wbc->pages_skipped; + err = mapping->a_ops->writepage(page, mpd->wbc); +- if (!err) ++ if (!err && (pages_skipped == mpd->wbc->pages_skipped)) ++ /* ++ * have successfully written the page ++ * without skipping the same ++ */ + mpd->pages_written++; + /* + * In error case, we have to continue because +@@ -2175,7 +2179,6 @@ static int mpage_da_writepages(struct ad + struct writeback_control *wbc, + struct mpage_da_data *mpd) + { +- long to_write; + int ret; + + if (!mpd->get_block) +@@ -2190,19 +2193,18 @@ static int mpage_da_writepages(struct ad + mpd->pages_written = 0; + mpd->retval = 0; + +- to_write = wbc->nr_to_write; +- + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); +- + /* + * Handle last extent of pages + */ + if (!mpd->io_done && mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); +- } + +- wbc->nr_to_write = to_write - mpd->pages_written; ++ mpd->io_done = 1; ++ ret = MPAGE_DA_EXTENT_TAIL; ++ } ++ wbc->nr_to_write -= mpd->pages_written; + return ret; + } + +@@ -2447,11 +2449,14 @@ static int ext4_da_writepages_trans_bloc + static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) + { ++ pgoff_t index; ++ int range_whole = 0; + handle_t *handle = NULL; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; ++ int no_nrwrite_index_update; ++ long pages_written = 0, pages_skipped; + int needed_blocks, ret = 0, nr_to_writebump = 0; +- long to_write, pages_skipped = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + /* +@@ -2485,16 +2490,26 @@ static int ext4_da_writepages(struct add + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } ++ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) ++ range_whole = 1; + +- +- pages_skipped = wbc->pages_skipped; ++ if (wbc->range_cyclic) ++ index = mapping->writeback_index; ++ else ++ index = wbc->range_start >> PAGE_CACHE_SHIFT; + + mpd.wbc = wbc; + mpd.inode = mapping->host; + +-restart_loop: +- to_write = wbc->nr_to_write; +- while (!ret && to_write > 0) { ++ /* ++ * we don't want write_cache_pages to update ++ * nr_to_write and writeback_index ++ */ ++ no_nrwrite_index_update = wbc->no_nrwrite_index_update; ++ wbc->no_nrwrite_index_update = 1; ++ pages_skipped = wbc->pages_skipped; ++ ++ while (!ret && wbc->nr_to_write > 0) { + + /* + * we insert one extent at a time. 
So we need +@@ -2527,46 +2542,53 @@ restart_loop: + goto out_writepages; + } + } +- to_write -= wbc->nr_to_write; +- + mpd.get_block = ext4_da_get_block_write; + ret = mpage_da_writepages(mapping, wbc, &mpd); + + ext4_journal_stop(handle); + +- if (mpd.retval == -ENOSPC) ++ if (mpd.retval == -ENOSPC) { ++ /* commit the transaction which would ++ * free blocks released in the transaction ++ * and try again ++ */ + jbd2_journal_force_commit_nested(sbi->s_journal); +- +- /* reset the retry count */ +- if (ret == MPAGE_DA_EXTENT_TAIL) { ++ wbc->pages_skipped = pages_skipped; ++ ret = 0; ++ } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ +- to_write += wbc->nr_to_write; ++ pages_written += mpd.pages_written; ++ wbc->pages_skipped = pages_skipped; + ret = 0; +- } else if (wbc->nr_to_write) { ++ } else if (wbc->nr_to_write) + /* + * There is no more writeout needed + * or we requested for a noblocking writeout + * and we found the device congested + */ +- to_write += wbc->nr_to_write; + break; +- } +- wbc->nr_to_write = to_write; +- } +- +- if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) { +- /* We skipped pages in this loop */ +- wbc->nr_to_write = to_write + +- wbc->pages_skipped - pages_skipped; +- wbc->pages_skipped = pages_skipped; +- goto restart_loop; + } ++ if (pages_skipped != wbc->pages_skipped) ++ printk(KERN_EMERG "This should not happen leaving %s " ++ "with nr_to_write = %ld ret = %d\n", ++ __func__, wbc->nr_to_write, ret); ++ ++ /* Update index */ ++ index += pages_written; ++ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) ++ /* ++ * set the writeback_index so that range_cyclic ++ * mode will write it back later ++ */ ++ mapping->writeback_index = index; + + out_writepages: +- wbc->nr_to_write = to_write - nr_to_writebump; ++ if (!no_nrwrite_index_update) ++ wbc->no_nrwrite_index_update = 0; ++ wbc->nr_to_write -= nr_to_writebump; + return ret; + } + diff --git a/queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch b/queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch new file mode 100644 index 00000000000..e3e06329c3d --- /dev/null +++ b/queue-2.6.27/ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch @@ -0,0 +1,103 @@ +From tytso@mit.edu Mon Apr 19 10:24:03 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:26:05 -0400 +Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages +To: stable@kernel.org +Cc: Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-12-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit 2acf2c261b823d9d9ed954f348b97620297a36b5 upstream. + +With delayed allocation we lock the page in write_cache_pages() and +try to build an in memory extent of contiguous blocks. This is needed +so that we can get large contiguous blocks request. If range_cyclic +mode is enabled, write_cache_pages() will loop back to the 0 index if +no I/O has been done yet, and try to start writing from the beginning +of the range. That causes an attempt to take the page lock of lower +index page while holding the page lock of higher index page, which can +cause a dead lock with another writeback thread. + +The solution is to implement the range_cyclic behavior in +ext4_da_writepages() instead. 
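A rough standalone illustration of that two-pass scheme follows (toy types and names, not the kernel's; the real code drives struct writeback_control and page locks, as the hunks below show):

    #include <stdio.h>

    /*
     * Toy model: one linear pass from writeback_index to the end of the
     * file, and only if that pass wrote nothing, a single bounded second
     * pass from index 0 up to the old starting point.  The generic
     * wrap-around in write_cache_pages() is what could re-take a lower
     * page lock while a higher one was still held.
     */
    #define NPAGES 16

    static void write_range(const int *dirty, int start, int end, int *written)
    {
        for (int i = start; i <= end && i < NPAGES; i++)
            if (dirty[i])
                (*written)++;
    }

    int main(void)
    {
        int dirty[NPAGES] = { [2] = 1 };  /* one dirty page below the start */
        int writeback_index = 5;          /* where the previous pass stopped */
        int written = 0;
        int cycled = (writeback_index == 0);

        /* first pass: writeback_index .. end of file, never wrapping */
        write_range(dirty, writeback_index, NPAGES - 1, &written);
        if (!written && !cycled) {
            /* nothing written yet: cycle exactly once from the start */
            cycled = 1;
            write_range(dirty, 0, writeback_index - 1, &written);
        }
        printf("pages written: %d\n", written);  /* page 2, on the 2nd pass */
        return 0;
    }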
+ +http://bugzilla.kernel.org/show_bug.cgi?id=12579 + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 21 +++++++++++++++++++-- + 1 file changed, 19 insertions(+), 2 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2456,6 +2456,7 @@ static int ext4_da_writepages(struct add + struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; ++ int range_cyclic, cycled = 1, io_done = 0; + int needed_blocks, ret = 0, nr_to_writebump = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + +@@ -2493,9 +2494,15 @@ static int ext4_da_writepages(struct add + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +- if (wbc->range_cyclic) ++ range_cyclic = wbc->range_cyclic; ++ if (wbc->range_cyclic) { + index = mapping->writeback_index; +- else ++ if (index) ++ cycled = 0; ++ wbc->range_start = index << PAGE_CACHE_SHIFT; ++ wbc->range_end = LLONG_MAX; ++ wbc->range_cyclic = 0; ++ } else + index = wbc->range_start >> PAGE_CACHE_SHIFT; + + mpd.wbc = wbc; +@@ -2509,6 +2516,7 @@ static int ext4_da_writepages(struct add + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + ++retry: + while (!ret && wbc->nr_to_write > 0) { + + /* +@@ -2563,6 +2571,7 @@ static int ext4_da_writepages(struct add + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; + ret = 0; ++ io_done = 1; + } else if (wbc->nr_to_write) + /* + * There is no more writeout needed +@@ -2571,6 +2580,13 @@ static int ext4_da_writepages(struct add + */ + break; + } ++ if (!io_done && !cycled) { ++ cycled = 1; ++ index = 0; ++ wbc->range_start = index << PAGE_CACHE_SHIFT; ++ wbc->range_end = mapping->writeback_index - 1; ++ goto retry; ++ } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", +@@ -2578,6 +2594,7 @@ static int ext4_da_writepages(struct add + + /* Update index */ + index += pages_written; ++ wbc->range_cyclic = range_cyclic; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic diff --git a/queue-2.6.27/ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch b/queue-2.6.27/ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch new file mode 100644 index 00000000000..33b971309ab --- /dev/null +++ b/queue-2.6.27/ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch @@ -0,0 +1,171 @@ +From tytso@mit.edu Mon Apr 19 10:19:40 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:25:55 -0400 +Subject: ext4: invalidate pages if delalloc block allocation fails. +To: stable@kernel.org +Cc: Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-2-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit c4a0c46ec92c194c873232b88debce4e1a448483 upstream. + +We are a bit agressive in invalidating all the pages. But +it is ok because we really don't know why the block allocation +failed and it is better to come of the writeback path +so that user can look for more info. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Jayson R. 
King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 73 insertions(+), 12 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1821,6 +1821,39 @@ static inline void __unmap_underlying_bl + unmap_underlying_metadata(bdev, bh->b_blocknr + i); + } + ++static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, ++ sector_t logical, long blk_cnt) ++{ ++ int nr_pages, i; ++ pgoff_t index, end; ++ struct pagevec pvec; ++ struct inode *inode = mpd->inode; ++ struct address_space *mapping = inode->i_mapping; ++ ++ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); ++ end = (logical + blk_cnt - 1) >> ++ (PAGE_CACHE_SHIFT - inode->i_blkbits); ++ while (index <= end) { ++ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); ++ if (nr_pages == 0) ++ break; ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ index = page->index; ++ if (index > end) ++ break; ++ index++; ++ ++ BUG_ON(!PageLocked(page)); ++ BUG_ON(PageWriteback(page)); ++ block_invalidatepage(page, 0); ++ ClearPageUptodate(page); ++ unlock_page(page); ++ } ++ } ++ return; ++} ++ + /* + * mpage_da_map_blocks - go through given space + * +@@ -1830,7 +1863,7 @@ static inline void __unmap_underlying_bl + * The function skips space we know is already mapped to disk blocks. + * + */ +-static void mpage_da_map_blocks(struct mpage_da_data *mpd) ++static int mpage_da_map_blocks(struct mpage_da_data *mpd) + { + int err = 0; + struct buffer_head *lbh = &mpd->lbh; +@@ -1841,7 +1874,7 @@ static void mpage_da_map_blocks(struct m + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) +- return; ++ return 0; + + new.b_state = lbh->b_state; + new.b_blocknr = 0; +@@ -1852,10 +1885,38 @@ static void mpage_da_map_blocks(struct m + * to write simply return + */ + if (!new.b_size) +- return; ++ return 0; + err = mpd->get_block(mpd->inode, next, &new, 1); +- if (err) +- return; ++ if (err) { ++ ++ /* If get block returns with error ++ * we simply return. Later writepage ++ * will redirty the page and writepages ++ * will find the dirty page again ++ */ ++ if (err == -EAGAIN) ++ return 0; ++ /* ++ * get block failure will cause us ++ * to loop in writepages. Because ++ * a_ops->writepage won't be able to ++ * make progress. The page will be redirtied ++ * by writepage and writepages will again ++ * try to write the same. ++ */ ++ printk(KERN_EMERG "%s block allocation failed for inode %lu " ++ "at logical offset %llu with max blocks " ++ "%zd with error %d\n", ++ __func__, mpd->inode->i_ino, ++ (unsigned long long)next, ++ lbh->b_size >> mpd->inode->i_blkbits, err); ++ printk(KERN_EMERG "This should not happen.!! 
" ++ "Data will be lost\n"); ++ /* invlaidate all the pages */ ++ ext4_da_block_invalidatepages(mpd, next, ++ lbh->b_size >> mpd->inode->i_blkbits); ++ return err; ++ } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) +@@ -1868,7 +1929,7 @@ static void mpage_da_map_blocks(struct m + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + +- return; ++ return 0; + } + + #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ +@@ -1937,8 +1998,8 @@ flush_it: + * We couldn't merge the block to our extent, so we + * need to flush current extent and start new one + */ +- mpage_da_map_blocks(mpd); +- mpage_da_submit_io(mpd); ++ if (mpage_da_map_blocks(mpd) == 0) ++ mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; + } +@@ -1980,8 +2041,8 @@ static int __mpage_da_writepage(struct p + * and start IO on them using writepage() + */ + if (mpd->next_page != mpd->first_page) { +- mpage_da_map_blocks(mpd); +- mpage_da_submit_io(mpd); ++ if (mpage_da_map_blocks(mpd) == 0) ++ mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ +@@ -2102,8 +2163,8 @@ static int mpage_da_writepages(struct ad + * Handle last extent of pages + */ + if (!mpd.io_done && mpd.next_page != mpd.first_page) { +- mpage_da_map_blocks(&mpd); +- mpage_da_submit_io(&mpd); ++ if (mpage_da_map_blocks(&mpd) == 0) ++ mpage_da_submit_io(&mpd); + } + + wbc->nr_to_write = to_write - mpd.pages_written; diff --git a/queue-2.6.27/ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch b/queue-2.6.27/ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch new file mode 100644 index 00000000000..56313d64fb2 --- /dev/null +++ b/queue-2.6.27/ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch @@ -0,0 +1,218 @@ +From tytso@mit.edu Mon Apr 19 10:20:41 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:25:57 -0400 +Subject: ext4: Make sure all the block allocation paths reserve blocks +To: stable@kernel.org +Cc: Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-4-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit a30d542a0035b886ffaafd0057ced0a2b28c3a4f upstream. + +With delayed allocation we need to make sure block are reserved before +we attempt to allocate them. Otherwise we get block allocation failure +(ENOSPC) during writepages which cannot be handled. This would mean +silent data loss (We do a printk stating data will be lost). This patch +updates the DIO and fallocate code path to do block reservation before +block allocation. This is needed to make sure parallel DIO and fallocate +request doesn't take block out of delayed reserve space. + +When free blocks count go below a threshold we switch to a slow patch +which looks at other CPU's accumulated percpu counter values. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jayson R. 
King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/balloc.c | 58 +++++++++++++++++++++++++++++++++++++++--------------- + fs/ext4/ext4.h | 13 ++++++++++++ + fs/ext4/inode.c | 5 ---- + fs/ext4/mballoc.c | 23 ++++++++++++--------- + 4 files changed, 69 insertions(+), 30 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -1754,6 +1754,32 @@ out: + return ret; + } + ++int ext4_claim_free_blocks(struct ext4_sb_info *sbi, ++ ext4_fsblk_t nblocks) ++{ ++ s64 free_blocks; ++ ext4_fsblk_t root_blocks = 0; ++ struct percpu_counter *fbc = &sbi->s_freeblocks_counter; ++ ++ free_blocks = percpu_counter_read(fbc); ++ ++ if (!capable(CAP_SYS_RESOURCE) && ++ sbi->s_resuid != current->fsuid && ++ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) ++ root_blocks = ext4_r_blocks_count(sbi->s_es); ++ ++ if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK) ++ free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter); ++ ++ if (free_blocks < (root_blocks + nblocks)) ++ /* we don't have free space */ ++ return -ENOSPC; ++ ++ /* reduce fs free blocks counter */ ++ percpu_counter_sub(fbc, nblocks); ++ return 0; ++} ++ + /** + * ext4_has_free_blocks() + * @sbi: in-core super block structure. +@@ -1775,18 +1801,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); +-#ifdef CONFIG_SMP +- if (free_blocks - root_blocks < FBC_BATCH) +- free_blocks = +- percpu_counter_sum(&sbi->s_freeblocks_counter); +-#endif ++ ++ if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK) ++ free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); ++ + if (free_blocks <= root_blocks) + /* we don't have free space */ + return 0; + if (free_blocks - root_blocks < nblocks) + return free_blocks - root_blocks; + return nblocks; +- } ++} + + + /** +@@ -1865,14 +1890,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_ + /* + * With delalloc we already reserved the blocks + */ +- *count = ext4_has_free_blocks(sbi, *count); +- } +- if (*count == 0) { +- *errp = -ENOSPC; +- return 0; /*return with ENOSPC error */ ++ if (ext4_claim_free_blocks(sbi, *count)) { ++ *errp = -ENOSPC; ++ return 0; /*return with ENOSPC error */ ++ } + } +- num = *count; +- + /* + * Check quota for allocation of this block. + */ +@@ -2067,9 +2089,13 @@ allocated: + le16_add_cpu(&gdp->bg_free_blocks_count, -num); + gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); + spin_unlock(sb_bgl_lock(sbi, group_no)); +- if (!EXT4_I(inode)->i_delalloc_reserved_flag) +- percpu_counter_sub(&sbi->s_freeblocks_counter, num); +- ++ if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) { ++ /* ++ * we allocated less blocks than we ++ * claimed. Add the difference back. 
++ */ ++ percpu_counter_add(&sbi->s_freeblocks_counter, *count - num); ++ } + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group_no); + spin_lock(sb_bgl_lock(sbi, flex_group)); +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1015,6 +1015,8 @@ extern ext4_fsblk_t ext4_new_blocks(hand + unsigned long *count, int *errp); + extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp); ++extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, ++ ext4_fsblk_t nblocks); + extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, + ext4_fsblk_t nblocks); + extern void ext4_free_blocks (handle_t *handle, struct inode *inode, +@@ -1245,6 +1247,17 @@ do { \ + __ext4_std_error((sb), __func__, (errno)); \ + } while (0) + ++#ifdef CONFIG_SMP ++/* Each CPU can accumulate FBC_BATCH blocks in their local ++ * counters. So we need to make sure we have free blocks more ++ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. ++ */ ++#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) ++#else ++#define EXT4_FREEBLOCKS_WATERMARK 0 ++#endif ++ ++ + /* + * Inodes and files operations + */ +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1564,13 +1564,10 @@ static int ext4_da_reserve_space(struct + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; + total = md_needed + nrblocks; + +- if (ext4_has_free_blocks(sbi, total) < total) { ++ if (ext4_claim_free_blocks(sbi, total)) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return -ENOSPC; + } +- /* reduce fs free blocks counter */ +- percpu_counter_sub(&sbi->s_freeblocks_counter, total); +- + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3194,9 +3194,15 @@ ext4_mb_mark_diskspace_used(struct ext4_ + * at write_begin() time for delayed allocation + * do not double accounting + */ +- if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) +- percpu_counter_sub(&sbi->s_freeblocks_counter, +- ac->ac_b_ex.fe_len); ++ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) && ++ ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) { ++ /* ++ * we allocated less blocks than we calimed ++ * Add the difference back ++ */ ++ percpu_counter_add(&sbi->s_freeblocks_counter, ++ ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len); ++ } + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, +@@ -4649,14 +4655,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t + /* + * With delalloc we already reserved the blocks + */ +- ar->len = ext4_has_free_blocks(sbi, ar->len); +- } +- +- if (ar->len == 0) { +- *errp = -ENOSPC; +- return 0; ++ if (ext4_claim_free_blocks(sbi, ar->len)) { ++ *errp = -ENOSPC; ++ return 0; ++ } + } +- + while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { + ar->flags |= EXT4_MB_HINT_NOPREALLOC; + ar->len--; diff --git a/queue-2.6.27/ext4-retry-block-allocation-if-we-have-free-blocks-left.patch b/queue-2.6.27/ext4-retry-block-allocation-if-we-have-free-blocks-left.patch new file mode 100644 index 00000000000..76a9962be1f --- /dev/null +++ b/queue-2.6.27/ext4-retry-block-allocation-if-we-have-free-blocks-left.patch @@ -0,0 +1,200 @@ +From tytso@mit.edu Mon Apr 19 10:22:08 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:26:00 -0400 +Subject: ext4: Retry block allocation if we have free blocks left +To: stable@kernel.org +Cc: "Theodore Ts'o" , Ext4 Developers List , Mingming Cao , 
"Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-7-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit df22291ff0fde0d350cf15dac3e5cc33ac528875 upstream. + +When we truncate files, the meta-data blocks released are not reused +untill we commit the truncate transaction. That means delayed get_block +request will return ENOSPC even if we have free blocks left. Force a +journal commit and retry block allocation if we get ENOSPC with free +blocks left. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 81 +++++++++++++++++++++++++++++++++++++++----------------- + 1 file changed, 57 insertions(+), 24 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1661,6 +1661,7 @@ struct mpage_da_data { + struct writeback_control *wbc; + int io_done; + long pages_written; ++ int retval; + }; + + /* +@@ -1858,6 +1859,24 @@ static void ext4_da_block_invalidatepage + return; + } + ++static void ext4_print_free_blocks(struct inode *inode) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ printk(KERN_EMERG "Total free blocks count %lld\n", ++ ext4_count_free_blocks(inode->i_sb)); ++ printk(KERN_EMERG "Free/Dirty block details\n"); ++ printk(KERN_EMERG "free_blocks=%lld\n", ++ percpu_counter_sum(&sbi->s_freeblocks_counter)); ++ printk(KERN_EMERG "dirty_blocks=%lld\n", ++ percpu_counter_sum(&sbi->s_dirtyblocks_counter)); ++ printk(KERN_EMERG "Block reservation details\n"); ++ printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", ++ EXT4_I(inode)->i_reserved_data_blocks); ++ printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", ++ EXT4_I(inode)->i_reserved_meta_blocks); ++ return; ++} ++ + /* + * mpage_da_map_blocks - go through given space + * +@@ -1872,7 +1891,7 @@ static int mpage_da_map_blocks(struct m + int err = 0; + struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; +- sector_t next = lbh->b_blocknr; ++ sector_t next; + + /* + * We consider only non-mapped and non-allocated blocks +@@ -1882,6 +1901,7 @@ static int mpage_da_map_blocks(struct m + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; ++ next = lbh->b_blocknr; + /* + * If we didn't accumulate anything + * to write simply return +@@ -1898,6 +1918,13 @@ static int mpage_da_map_blocks(struct m + */ + if (err == -EAGAIN) + return 0; ++ ++ if (err == -ENOSPC && ++ ext4_count_free_blocks(mpd->inode->i_sb)) { ++ mpd->retval = err; ++ return 0; ++ } ++ + /* + * get block failure will cause us + * to loop in writepages. Because +@@ -1915,8 +1942,7 @@ static int mpage_da_map_blocks(struct m + printk(KERN_EMERG "This should not happen.!! 
" + "Data will be lost\n"); + if (err == -ENOSPC) { +- printk(KERN_CRIT "Total free blocks count %lld\n", +- ext4_count_free_blocks(mpd->inode->i_sb)); ++ ext4_print_free_blocks(mpd->inode); + } + /* invlaidate all the pages */ + ext4_da_block_invalidatepages(mpd, next, +@@ -2141,39 +2167,36 @@ static int __mpage_da_writepage(struct p + */ + static int mpage_da_writepages(struct address_space *mapping, + struct writeback_control *wbc, +- get_block_t get_block) ++ struct mpage_da_data *mpd) + { +- struct mpage_da_data mpd; + long to_write; + int ret; + +- if (!get_block) ++ if (!mpd->get_block) + return generic_writepages(mapping, wbc); + +- mpd.wbc = wbc; +- mpd.inode = mapping->host; +- mpd.lbh.b_size = 0; +- mpd.lbh.b_state = 0; +- mpd.lbh.b_blocknr = 0; +- mpd.first_page = 0; +- mpd.next_page = 0; +- mpd.get_block = get_block; +- mpd.io_done = 0; +- mpd.pages_written = 0; ++ mpd->lbh.b_size = 0; ++ mpd->lbh.b_state = 0; ++ mpd->lbh.b_blocknr = 0; ++ mpd->first_page = 0; ++ mpd->next_page = 0; ++ mpd->io_done = 0; ++ mpd->pages_written = 0; ++ mpd->retval = 0; + + to_write = wbc->nr_to_write; + +- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); ++ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); + + /* + * Handle last extent of pages + */ +- if (!mpd.io_done && mpd.next_page != mpd.first_page) { +- if (mpage_da_map_blocks(&mpd) == 0) +- mpage_da_submit_io(&mpd); ++ if (!mpd->io_done && mpd->next_page != mpd->first_page) { ++ if (mpage_da_map_blocks(mpd) == 0) ++ mpage_da_submit_io(mpd); + } + +- wbc->nr_to_write = to_write - mpd.pages_written; ++ wbc->nr_to_write = to_write - mpd->pages_written; + return ret; + } + +@@ -2420,6 +2443,7 @@ static int ext4_da_writepages(struct add + { + handle_t *handle = NULL; + loff_t range_start = 0; ++ struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int needed_blocks, ret = 0, nr_to_writebump = 0; + long to_write, pages_skipped = 0; +@@ -2467,6 +2491,9 @@ static int ext4_da_writepages(struct add + range_start = wbc->range_start; + pages_skipped = wbc->pages_skipped; + ++ mpd.wbc = wbc; ++ mpd.inode = mapping->host; ++ + restart_loop: + to_write = wbc->nr_to_write; + while (!ret && to_write > 0) { +@@ -2502,11 +2529,17 @@ restart_loop: + goto out_writepages; + } + } +- + to_write -= wbc->nr_to_write; +- ret = mpage_da_writepages(mapping, wbc, +- ext4_da_get_block_write); ++ ++ mpd.get_block = ext4_da_get_block_write; ++ ret = mpage_da_writepages(mapping, wbc, &mpd); ++ + ext4_journal_stop(handle); ++ ++ if (mpd.retval == -ENOSPC) ++ jbd2_journal_force_commit_nested(sbi->s_journal); ++ ++ /* reset the retry count */ + if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with diff --git a/queue-2.6.27/ext4-retry-block-reservation.patch b/queue-2.6.27/ext4-retry-block-reservation.patch new file mode 100644 index 00000000000..d4e3cde902d --- /dev/null +++ b/queue-2.6.27/ext4-retry-block-reservation.patch @@ -0,0 +1,131 @@ +From tytso@mit.edu Mon Apr 19 10:21:18 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:25:59 -0400 +Subject: ext4: Retry block reservation +To: stable@kernel.org +Cc: "Theodore Ts'o" , Ext4 Developers List , Mingming Cao , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-6-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit 030ba6bc67b4f2bc5cd174f57785a1745c929abe upstream. + +During block reservation if we don't have enough blocks left, retry +block reservation with smaller block counts. 
This makes sure we try +fallocate and DIO with smaller request size and don't fail early. The +delayed allocation reservation cannot try with smaller block count. So +retry block reservation to handle temporary disk full conditions. Also +print free blocks details if we fail block allocation during writepages. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/balloc.c | 8 +++++++- + fs/ext4/inode.c | 14 +++++++++++--- + fs/ext4/mballoc.c | 7 ++++++- + 3 files changed, 24 insertions(+), 5 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -1907,10 +1907,16 @@ ext4_fsblk_t ext4_old_new_blocks(handle_ + /* + * With delalloc we already reserved the blocks + */ +- if (ext4_claim_free_blocks(sbi, *count)) { ++ while (*count && ext4_claim_free_blocks(sbi, *count)) { ++ /* let others to free the space */ ++ yield(); ++ *count = *count >> 1; ++ } ++ if (!*count) { + *errp = -ENOSPC; + return 0; /*return with ENOSPC error */ + } ++ num = *count; + } + /* + * Check quota for allocation of this block. +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1549,6 +1549,7 @@ static int ext4_journalled_write_end(str + + static int ext4_da_reserve_space(struct inode *inode, int nrblocks) + { ++ int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + unsigned long md_needed, mdblocks, total = 0; + +@@ -1557,6 +1558,7 @@ static int ext4_da_reserve_space(struct + * in order to allocate nrblocks + * worse case is one extent per block + */ ++repeat: + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; + mdblocks = ext4_calc_metadata_amount(inode, total); +@@ -1567,6 +1569,10 @@ static int ext4_da_reserve_space(struct + + if (ext4_claim_free_blocks(sbi, total)) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); ++ if (ext4_should_retry_alloc(inode->i_sb, &retries)) { ++ yield(); ++ goto repeat; ++ } + return -ENOSPC; + } + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; +@@ -1864,20 +1870,18 @@ static void ext4_da_block_invalidatepage + static int mpage_da_map_blocks(struct mpage_da_data *mpd) + { + int err = 0; ++ struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; + sector_t next = lbh->b_blocknr; +- struct buffer_head new; + + /* + * We consider only non-mapped and non-allocated blocks + */ + if (buffer_mapped(lbh) && !buffer_delay(lbh)) + return 0; +- + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; +- + /* + * If we didn't accumulate anything + * to write simply return +@@ -1910,6 +1914,10 @@ static int mpage_da_map_blocks(struct m + lbh->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_EMERG "This should not happen.!! 
" + "Data will be lost\n"); ++ if (err == -ENOSPC) { ++ printk(KERN_CRIT "Total free blocks count %lld\n", ++ ext4_count_free_blocks(mpd->inode->i_sb)); ++ } + /* invlaidate all the pages */ + ext4_da_block_invalidatepages(mpd, next, + lbh->b_size >> mpd->inode->i_blkbits); +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4651,7 +4651,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t + /* + * With delalloc we already reserved the blocks + */ +- if (ext4_claim_free_blocks(sbi, ar->len)) { ++ while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { ++ /* let others to free the space */ ++ yield(); ++ ar->len = ar->len >> 1; ++ } ++ if (!ar->len) { + *errp = -ENOSPC; + return 0; + } diff --git a/queue-2.6.27/ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch b/queue-2.6.27/ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch new file mode 100644 index 00000000000..b908a0badc5 --- /dev/null +++ b/queue-2.6.27/ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch @@ -0,0 +1,99 @@ +From tytso@mit.edu Mon Apr 19 10:22:28 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:26:01 -0400 +Subject: ext4: Use tag dirty lookup during mpage_da_submit_io +To: stable@kernel.org +Cc: Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-8-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit af6f029d3836eb7264cd3fbb13a6baf0e5fdb5ea upstream. + +This enables us to drop the range_cont writeback mode +use from ext4_da_writepages. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/inode.c | 30 +++++++++++++----------------- + 1 file changed, 13 insertions(+), 17 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1699,17 +1699,23 @@ static int mpage_da_submit_io(struct mpa + + pagevec_init(&pvec, 0); + while (index <= end) { +- nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); ++ /* ++ * We can use PAGECACHE_TAG_DIRTY lookup here because ++ * even though we have cleared the dirty flag on the page ++ * We still keep the page in the radix tree with tag ++ * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. ++ * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback ++ * which is called via the below writepage callback. 
++ */ ++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, ++ PAGECACHE_TAG_DIRTY, ++ min(end - index, ++ (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + +- index = page->index; +- if (index > end) +- break; +- index++; +- + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + +@@ -2442,7 +2448,6 @@ static int ext4_da_writepages(struct add + struct writeback_control *wbc) + { + handle_t *handle = NULL; +- loff_t range_start = 0; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int needed_blocks, ret = 0, nr_to_writebump = 0; +@@ -2481,14 +2486,7 @@ static int ext4_da_writepages(struct add + wbc->nr_to_write = sbi->s_mb_stream_request; + } + +- if (!wbc->range_cyclic) +- /* +- * If range_cyclic is not set force range_cont +- * and save the old writeback_index +- */ +- wbc->range_cont = 1; + +- range_start = wbc->range_start; + pages_skipped = wbc->pages_skipped; + + mpd.wbc = wbc; +@@ -2559,9 +2557,8 @@ restart_loop: + wbc->nr_to_write = to_write; + } + +- if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { ++ if (!wbc->range_cyclic && (pages_skipped != wbc->pages_skipped)) { + /* We skipped pages in this loop */ +- wbc->range_start = range_start; + wbc->nr_to_write = to_write + + wbc->pages_skipped - pages_skipped; + wbc->pages_skipped = pages_skipped; +@@ -2570,7 +2567,6 @@ restart_loop: + + out_writepages: + wbc->nr_to_write = to_write - nr_to_writebump; +- wbc->range_start = range_start; + return ret; + } + diff --git a/queue-2.6.27/percpu-counter-clean-up-percpu_counter_sum_and_set.patch b/queue-2.6.27/percpu-counter-clean-up-percpu_counter_sum_and_set.patch new file mode 100644 index 00000000000..86d95f9e164 --- /dev/null +++ b/queue-2.6.27/percpu-counter-clean-up-percpu_counter_sum_and_set.patch @@ -0,0 +1,104 @@ +From tytso@mit.edu Mon Apr 19 10:20:04 2010 +From: Mingming Cao +Date: Mon, 15 Mar 2010 20:25:56 -0400 +Subject: percpu counter: clean up percpu_counter_sum_and_set() +To: stable@kernel.org +Cc: "Theodore Ts'o" , Andrew Morton , Ext4 Developers List , Mingming Cao , "Jayson R. King" +Message-ID: <1268699165-17461-3-git-send-email-tytso@mit.edu> + + +From: Mingming Cao + +commit 1f7c14c62ce63805f9574664a6c6de3633d4a354 upstream. + +percpu_counter_sum_and_set() and percpu_counter_sum() is the same except +the former updates the global counter after accounting. Since we are +taking the fbc->lock to calculate the precise value of the counter in +percpu_counter_sum() anyway, it should simply set fbc->count too, as the +percpu_counter_sum_and_set() does. + +This patch merges these two interfaces into one. + +Signed-off-by: Mingming Cao +Acked-by: Peter Zijlstra +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Jayson R. 
King +Signed-off-by: Theodore Ts'o + +--- + fs/ext4/balloc.c | 2 +- + include/linux/percpu_counter.h | 12 +++--------- + lib/percpu_counter.c | 8 +++----- + 3 files changed, 7 insertions(+), 15 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -1778,7 +1778,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct + #ifdef CONFIG_SMP + if (free_blocks - root_blocks < FBC_BATCH) + free_blocks = +- percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); ++ percpu_counter_sum(&sbi->s_freeblocks_counter); + #endif + if (free_blocks <= root_blocks) + /* we don't have free space */ +--- a/include/linux/percpu_counter.h ++++ b/include/linux/percpu_counter.h +@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percp + void percpu_counter_destroy(struct percpu_counter *fbc); + void percpu_counter_set(struct percpu_counter *fbc, s64 amount); + void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); +-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); ++s64 __percpu_counter_sum(struct percpu_counter *fbc); + + static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) + { +@@ -44,19 +44,13 @@ static inline void percpu_counter_add(st + + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) + { +- s64 ret = __percpu_counter_sum(fbc, 0); ++ s64 ret = __percpu_counter_sum(fbc); + return ret < 0 ? 0 : ret; + } + +-static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) +-{ +- return __percpu_counter_sum(fbc, 1); +-} +- +- + static inline s64 percpu_counter_sum(struct percpu_counter *fbc) + { +- return __percpu_counter_sum(fbc, 0); ++ return __percpu_counter_sum(fbc); + } + + static inline s64 percpu_counter_read(struct percpu_counter *fbc) +--- a/lib/percpu_counter.c ++++ b/lib/percpu_counter.c +@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); + * Add up all the per-cpu counts, return the result. 
This is a more accurate + * but much slower version of percpu_counter_read_positive() + */ +-s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) ++s64 __percpu_counter_sum(struct percpu_counter *fbc) + { + s64 ret; + int cpu; +@@ -62,11 +62,9 @@ s64 __percpu_counter_sum(struct percpu_c + for_each_online_cpu(cpu) { + s32 *pcount = per_cpu_ptr(fbc->counters, cpu); + ret += *pcount; +- if (set) +- *pcount = 0; ++ *pcount = 0; + } +- if (set) +- fbc->count = ret; ++ fbc->count = ret; + + spin_unlock(&fbc->lock); + return ret; diff --git a/queue-2.6.27/series b/queue-2.6.27/series index 9091fb02c6d..3041f8329e3 100644 --- a/queue-2.6.27/series +++ b/queue-2.6.27/series @@ -1 +1,12 @@ alsa-mixart-range-checking-proc-file.patch +ext4-invalidate-pages-if-delalloc-block-allocation-fails.patch +percpu-counter-clean-up-percpu_counter_sum_and_set.patch +ext4-make-sure-all-the-block-allocation-paths-reserve-blocks.patch +ext4-add-percpu-dirty-block-accounting.patch +ext4-retry-block-reservation.patch +ext4-retry-block-allocation-if-we-have-free-blocks-left.patch +ext4-use-tag-dirty-lookup-during-mpage_da_submit_io.patch +vfs-remove-the-range_cont-writeback-mode.patch +vfs-add-no_nrwrite_index_update-writeback-control-flag.patch +ext4-fix-file-fragmentation-during-large-file-write.patch +ext4-implement-range_cyclic-in-ext4_da_writepages-instead-of-write_cache_pages.patch diff --git a/queue-2.6.27/vfs-add-no_nrwrite_index_update-writeback-control-flag.patch b/queue-2.6.27/vfs-add-no_nrwrite_index_update-writeback-control-flag.patch new file mode 100644 index 00000000000..a6b528652cd --- /dev/null +++ b/queue-2.6.27/vfs-add-no_nrwrite_index_update-writeback-control-flag.patch @@ -0,0 +1,87 @@ +From tytso@mit.edu Mon Apr 19 10:23:14 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:26:03 -0400 +Subject: vfs: Add no_nrwrite_index_update writeback control flag +To: stable@kernel.org +Cc: linux-fsdevel@vger.kernel.org, Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-10-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit 17bc6c30cf6bfffd816bdc53682dd46fc34a2cf4 upstream. + +If no_nrwrite_index_update is set we don't update nr_to_write and +address space writeback_index in write_cache_pages. This change +enables a file system to skip these updates in write_cache_pages and do +them in the writepages() callback. This patch will be followed by an +ext4 patch that make use of these new flags. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +CC: linux-fsdevel@vger.kernel.org +[dev@jaysonking.com: Modified the patch to account for subsequent changes in mainline being cherry-picked earlier for 2.6.27.y.] +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/writeback.h | 9 +++++++++ + mm/page-writeback.c | 14 +++++++++----- + 2 files changed, 18 insertions(+), 5 deletions(-) + +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -62,6 +62,15 @@ struct writeback_control { + unsigned for_writepages:1; /* This is a writepages() call */ + unsigned range_cyclic:1; /* range_start is cyclic */ + unsigned more_io:1; /* more io to be dispatched */ ++ /* ++ * write_cache_pages() won't update wbc->nr_to_write and ++ * mapping->writeback_index if no_nrwrite_index_update ++ * is set. 
write_cache_pages() may write more than we ++ * requested and we want to make sure nr_to_write and ++ * writeback_index are updated in a consistent manner ++ * so we use a single control to update them ++ */ ++ unsigned no_nrwrite_index_update:1; + }; + + /* +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -878,6 +878,7 @@ int write_cache_pages(struct address_spa + pgoff_t done_index; + int cycled; + int range_whole = 0; ++ long nr_to_write = wbc->nr_to_write; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; +@@ -985,9 +986,9 @@ continue_unlock: + } + } + +- if (wbc->nr_to_write > 0) { +- wbc->nr_to_write--; +- if (wbc->nr_to_write == 0 && ++ if (nr_to_write > 0) { ++ nr_to_write--; ++ if (nr_to_write == 0 && + wbc->sync_mode == WB_SYNC_NONE) { + /* + * We stop writing back only if we are +@@ -1024,8 +1025,11 @@ continue_unlock: + end = writeback_index - 1; + goto retry; + } +- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) +- mapping->writeback_index = done_index; ++ if (!wbc->no_nrwrite_index_update) { ++ if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) ++ mapping->writeback_index = done_index; ++ wbc->nr_to_write = nr_to_write; ++ } + + return ret; + } diff --git a/queue-2.6.27/vfs-remove-the-range_cont-writeback-mode.patch b/queue-2.6.27/vfs-remove-the-range_cont-writeback-mode.patch new file mode 100644 index 00000000000..5e5a70c269e --- /dev/null +++ b/queue-2.6.27/vfs-remove-the-range_cont-writeback-mode.patch @@ -0,0 +1,50 @@ +From tytso@mit.edu Mon Apr 19 10:22:47 2010 +From: Aneesh Kumar K.V +Date: Mon, 15 Mar 2010 20:26:02 -0400 +Subject: vfs: Remove the range_cont writeback mode. +To: stable@kernel.org +Cc: linux-fsdevel@vger.kernel.org, Ext4 Developers List , "Theodore Ts'o" , "Jayson R. King" , "Aneesh Kumar K.V" +Message-ID: <1268699165-17461-9-git-send-email-tytso@mit.edu> + + +From: Aneesh Kumar K.V + +commit 74baaaaec8b4f22e1ae279f5ecca4ff705b28912 upstream. + +Ext4 was the only user of range_cont writeback mode and ext4 switched +to a different method. So remove the range_cont mode which is not used +in the kernel. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +CC: linux-fsdevel@vger.kernel.org +Signed-off-by: Jayson R. King +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/writeback.h | 1 - + mm/page-writeback.c | 2 -- + 2 files changed, 3 deletions(-) + +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -62,7 +62,6 @@ struct writeback_control { + unsigned for_writepages:1; /* This is a writepages() call */ + unsigned range_cyclic:1; /* range_start is cyclic */ + unsigned more_io:1; /* more io to be dispatched */ +- unsigned range_cont:1; + }; + + /* +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -1027,8 +1027,6 @@ continue_unlock: + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + +- if (wbc->range_cont) +- wbc->range_start = index << PAGE_CACHE_SHIFT; + return ret; + } + EXPORT_SYMBOL(write_cache_pages);
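The recurring accounting pattern in this series is worth a closing illustration: a percpu counter read is cheap but can be stale by up to BATCH * NR_CPUS, so the claim path trusts the approximate value only while the request stays clear of that error margin, and falls back to the exact (slow, locked) sum near the watermark. The sketch below models that with plain arrays; all names are illustrative, and the kernel's struct percpu_counter, locking, and root-reservation checks are omitted:

    #include <stdio.h>

    #define NR_CPUS 4
    #define BATCH 32
    #define WATERMARK (4 * BATCH * NR_CPUS)  /* like EXT4_FREEBLOCKS_WATERMARK */

    struct toy_counter {
        long long count;      /* global value, folded in every BATCH events */
        long local[NR_CPUS];  /* per-CPU residue not yet folded in */
    };

    static long long approx_read(const struct toy_counter *c)
    {
        return c->count;      /* fast path: ignores the per-CPU residue */
    }

    static long long exact_sum(const struct toy_counter *c)
    {
        long long sum = c->count;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            sum += c->local[cpu];
        return sum;
    }

    /* claim nblocks: account them as dirty instead of decrementing free */
    static int claim_blocks(struct toy_counter *freeb,
                            struct toy_counter *dirtyb, long long nblocks)
    {
        long long f = approx_read(freeb);
        long long d = approx_read(dirtyb);

        if (f - (nblocks + d) < WATERMARK) {  /* too close to call */
            f = exact_sum(freeb);
            d = exact_sum(dirtyb);
        }
        if (f < nblocks + d)
            return -1;                        /* would be -ENOSPC */
        dirtyb->count += nblocks;             /* reserved, not yet allocated */
        return 0;
    }

    int main(void)
    {
        struct toy_counter freeb  = { .count = 600, .local = { 10, -5, 20, 0 } };
        struct toy_counter dirtyb = { .count = 100 };

        printf("claim 400: %s\n",
               claim_blocks(&freeb, &dirtyb, 400) ? "ENOSPC" : "ok");
        printf("claim 400 again: %s\n",
               claim_blocks(&freeb, &dirtyb, 400) ? "ENOSPC" : "ok");
        return 0;
    }

At actual allocation time the patches above then move the claim out of the dirty counter as the free counter is decremented (percpu_counter_sub on both), which is also why the ext4_statfs hunk reports f_bfree as the free count minus the dirty count.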