From: Greg Kroah-Hartman Date: Thu, 10 Dec 2009 18:37:08 +0000 (-0800) Subject: .31 patches X-Git-Tag: v2.6.31.8~8 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7f041dd70b3e1072f0a6b07b92198167045b6c98;p=thirdparty%2Fkernel%2Fstable-queue.git .31 patches --- diff --git a/queue-2.6.31/0001-ext4-Fix-memory-leak-fix-when-mounting-an-ext4-files.patch b/queue-2.6.31/0001-ext4-Fix-memory-leak-fix-when-mounting-an-ext4-files.patch new file mode 100644 index 00000000000..e504f73ceaf --- /dev/null +++ b/queue-2.6.31/0001-ext4-Fix-memory-leak-fix-when-mounting-an-ext4-files.patch @@ -0,0 +1,73 @@ +From b94ab13e6409f51707fe29273b7659d9096eb41c Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Fri, 17 Jul 2009 09:01:04 -0400 +Subject: [PATCH 01/85] ext4: Fix memory leak fix when mounting an ext4 filesystem + +(cherry picked from commit 024eab4d5bf7e3168a2b71038b3e04e6b1f376ed) + +The allocation of the ext4_group_info array was moved to a new +function ext4_mb_add_group_info() in commit 5f21b0e6 so that online +resize would use a common (and correct) codepath. Unfortunately, the +call to the new ext4_mb_add_group_info() function was added without +removing the code which originally allocated the array. This caused a +memory leak each time an ext4 filesystem was mounted. + +The fix is simple; remove the code that did the original allocation, +since it is no longer needed. 
+ +Reported-by: Catalin Marinas +Tested-by: Catalin Marinas +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 19 ------------------- + 1 file changed, 19 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2571,13 +2571,11 @@ static int ext4_mb_init_backend(struct s + { + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; +- int metalen; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int num_meta_group_infos; + int num_meta_group_infos_max; + int array_size; +- struct ext4_group_info **meta_group_info; + struct ext4_group_desc *desc; + + /* This is the number of blocks used by GDT */ +@@ -2622,22 +2620,6 @@ static int ext4_mb_init_backend(struct s + goto err_freesgi; + } + EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; +- +- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); +- for (i = 0; i < num_meta_group_infos; i++) { +- if ((i + 1) == num_meta_group_infos) +- metalen = sizeof(*meta_group_info) * +- (ngroups - +- (i << EXT4_DESC_PER_BLOCK_BITS(sb))); +- meta_group_info = kmalloc(metalen, GFP_KERNEL); +- if (meta_group_info == NULL) { +- printk(KERN_ERR "EXT4-fs: can't allocate mem for a " +- "buddy group\n"); +- goto err_freemeta; +- } +- sbi->s_group_info[i] = meta_group_info; +- } +- + for (i = 0; i < ngroups; i++) { + desc = ext4_get_group_desc(sb, i, NULL); + if (desc == NULL) { +@@ -2655,7 +2637,6 @@ err_freebuddy: + while (i-- > 0) + kfree(ext4_get_group_info(sb, i)); + i = num_meta_group_infos; +-err_freemeta: + while (i-- > 0) + kfree(sbi->s_group_info[i]); + iput(sbi->s_buddy_cache); diff --git a/queue-2.6.31/0002-ext4-Avoid-null-pointer-dereference-when-decoding-ER.patch b/queue-2.6.31/0002-ext4-Avoid-null-pointer-dereference-when-decoding-ER.patch new file mode 100644 index 00000000000..96c11d628e4 --- /dev/null +++ 
b/queue-2.6.31/0002-ext4-Avoid-null-pointer-dereference-when-decoding-ER.patch @@ -0,0 +1,29 @@ +From e8abf4864c0fbf86141cb72e4eb2acce4bd4b7fa Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 27 Jul 2009 23:09:47 -0400 +Subject: [PATCH 02/85] ext4: Avoid null pointer dereference when decoding EROFS w/o a journal + +(cherry picked from commit 78f1ddbb498283c2445c11b0dfa666424c301803) + +We need to check to make sure a journal is present before checking the +journal flags in ext4_decode_error(). + +Signed-off-by: Eric Sesterhenn +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -344,7 +344,8 @@ static const char *ext4_decode_error(str + errstr = "Out of memory"; + break; + case -EROFS: +- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) ++ if (!sb || (EXT4_SB(sb)->s_journal && ++ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; diff --git a/queue-2.6.31/0003-jbd2-Fail-to-load-a-journal-if-it-is-too-short.patch b/queue-2.6.31/0003-jbd2-Fail-to-load-a-journal-if-it-is-too-short.patch new file mode 100644 index 00000000000..213f2075315 --- /dev/null +++ b/queue-2.6.31/0003-jbd2-Fail-to-load-a-journal-if-it-is-too-short.patch @@ -0,0 +1,32 @@ +From 558b413137123c701eda6e62d927bff476655e3c Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Fri, 17 Jul 2009 10:40:01 -0400 +Subject: [PATCH 03/85] jbd2: Fail to load a journal if it is too short + +(cherry picked from commit f6f50e28f0cb8d7bcdfaacc83129f005dede11b1) + +Due to on disk corruption, it can happen that journal is too short. Fail +to load it in such case so that we don't oops somewhere later. 
+ +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/journal.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *jour + + first = be32_to_cpu(sb->s_first); + last = be32_to_cpu(sb->s_maxlen); ++ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { ++ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", ++ first, last); ++ journal_fail_superblock(journal); ++ return -EINVAL; ++ } + + journal->j_first = first; + journal->j_last = last; diff --git a/queue-2.6.31/0004-jbd2-round-commit-timer-up-to-avoid-uncommitted-tran.patch b/queue-2.6.31/0004-jbd2-round-commit-timer-up-to-avoid-uncommitted-tran.patch new file mode 100644 index 00000000000..e7bae81bc39 --- /dev/null +++ b/queue-2.6.31/0004-jbd2-round-commit-timer-up-to-avoid-uncommitted-tran.patch @@ -0,0 +1,31 @@ +From e32c5e90d8b7cf1370c15db75245dab7135db931 Mon Sep 17 00:00:00 2001 +From: Andreas Dilger +Date: Mon, 10 Aug 2009 22:51:53 -0400 +Subject: [PATCH 04/85] jbd2: round commit timer up to avoid uncommitted transaction + +(cherry picked from commit b1f485f20eb9b02cc7d2009556287f3939d480cc) + +fix jiffie rounding in jbd commit timer setup code. Rounding down +could cause the timer to be fired before the corresponding transaction +has expired. That transaction can stay not committed forever if no +new transaction is created or explicit sync/umount happens. + +Signed-off-by: Alex Zhuravlev (Tomas) +Signed-off-by: Andreas Dilger +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/transaction.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, + INIT_LIST_HEAD(&transaction->t_private_list); + + /* Set up the commit timer for the new transaction. 
*/ +- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); ++ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); + add_timer(&journal->j_commit_timer); + + J_ASSERT(journal->j_running_transaction == NULL); diff --git a/queue-2.6.31/0005-ext4-fix-journal-ref-count-in-move_extent_par_page.patch b/queue-2.6.31/0005-ext4-fix-journal-ref-count-in-move_extent_par_page.patch new file mode 100644 index 00000000000..e15e0017741 --- /dev/null +++ b/queue-2.6.31/0005-ext4-fix-journal-ref-count-in-move_extent_par_page.patch @@ -0,0 +1,31 @@ +From d4ed91275abd6adef2149e9bb824076ec75638fe Mon Sep 17 00:00:00 2001 +From: Peng Tao +Date: Mon, 10 Aug 2009 23:05:28 -0400 +Subject: [PATCH 05/85] ext4: fix journal ref count in move_extent_par_page + +(cherry picked from commit 91cc219ad963731191247c5f2db4118be2bc341a) + +move_extent_par_page calls a_ops->write_begin() to increase journal +handler's reference count. However, if either mext_replace_branches() +or ext4_get_block fails, the increased reference count isn't +decreased. This will cause a later attempt to umount of the fs to hang +forever. The patch addresses the issue by calling ext4_journal_stop() +if page is not NULL (which means a_ops->write_end() isn't invoked). 
+ +Signed-off-by: Peng Tao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -871,6 +871,7 @@ out: + if (PageLocked(page)) + unlock_page(page); + page_cache_release(page); ++ ext4_journal_stop(handle); + } + out2: + ext4_journal_stop(handle); diff --git a/queue-2.6.31/0006-ext4-Fix-bugs-in-mballoc-s-stream-allocation-mode.patch b/queue-2.6.31/0006-ext4-Fix-bugs-in-mballoc-s-stream-allocation-mode.patch new file mode 100644 index 00000000000..83bc07912d5 --- /dev/null +++ b/queue-2.6.31/0006-ext4-Fix-bugs-in-mballoc-s-stream-allocation-mode.patch @@ -0,0 +1,103 @@ +From 4637f3ad479792230ef8290c9741ab23a7631ae2 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 9 Aug 2009 22:01:13 -0400 +Subject: [PATCH 06/85] ext4: Fix bugs in mballoc's stream allocation mode + +(cherry picked from commit 4ba74d00a20256e22f159cb288ff34b587608917) + +The logic around sbi->s_mb_last_group and sbi->s_mb_last_start was all +screwed up. These fields were getting unconditionally all the time, +set even when stream allocation had not taken place, and if they were +being used when the file was smaller than s_mb_stream_request, which +is when the allocation should _not_ be doing stream allocation. + +Fix this by determining whether or not stream allocation should +take place once, in ext4_mb_group_or_file(), and setting a flag which +gets used in ext4_mb_regular_allocator() and ext4_mb_use_best_found(). +This simplifies the code and assures that we are consistently using +(or not using) the stream allocation logic. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 2 ++ + fs/ext4/mballoc.c | 23 ++++++++++------------- + 2 files changed, 12 insertions(+), 13 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t; + #define EXT4_MB_HINT_TRY_GOAL 512 + /* blocks already pre-reserved by delayed allocation */ + #define EXT4_MB_DELALLOC_RESERVED 1024 ++/* We are doing stream allocation */ ++#define EXT4_MB_STREAM_ALLOC 2048 + + + struct ext4_allocation_request { +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1360,7 +1360,7 @@ static void ext4_mb_use_best_found(struc + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; + /* store last allocated for subsequent stream allocation */ +- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { ++ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + spin_lock(&sbi->s_md_lock); + sbi->s_mb_last_group = ac->ac_f_ex.fe_group; + sbi->s_mb_last_start = ac->ac_f_ex.fe_start; +@@ -1938,7 +1938,6 @@ ext4_mb_regular_allocator(struct ext4_al + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_buddy e4b; +- loff_t size, isize; + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -1974,20 +1973,16 @@ ext4_mb_regular_allocator(struct ext4_al + } + + bsbits = ac->ac_sb->s_blocksize_bits; +- /* if stream allocation is enabled, use global goal */ +- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; +- isize = i_size_read(ac->ac_inode) >> bsbits; +- if (size < isize) +- size = isize; + +- if (size < sbi->s_mb_stream_request && +- (ac->ac_flags & EXT4_MB_HINT_DATA)) { ++ /* if stream allocation is enabled, use global goal */ ++ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { + /* TBD: may be hot point */ + spin_lock(&sbi->s_md_lock); + ac->ac_g_ex.fe_group = sbi->s_mb_last_group; + ac->ac_g_ex.fe_start = sbi->s_mb_last_start; + spin_unlock(&sbi->s_md_lock); + } ++ + /* Let's just scan groups to find more-less suitable blocks */ + cr = ac->ac_2order ? 
0 : 1; + /* +@@ -4155,16 +4150,18 @@ static void ext4_mb_group_or_file(struct + if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) + return; + ++ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) ++ return; ++ + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + isize = i_size_read(ac->ac_inode) >> bsbits; + size = max(size, isize); + + /* don't use group allocation for large files */ +- if (size >= sbi->s_mb_stream_request) +- return; +- +- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) ++ if (size >= sbi->s_mb_stream_request) { ++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; ++ } + + BUG_ON(ac->ac_lg != NULL); + /* diff --git a/queue-2.6.31/0007-ext4-Avoid-group-preallocation-for-closed-files.patch b/queue-2.6.31/0007-ext4-Avoid-group-preallocation-for-closed-files.patch new file mode 100644 index 00000000000..23d35dc9678 --- /dev/null +++ b/queue-2.6.31/0007-ext4-Avoid-group-preallocation-for-closed-files.patch @@ -0,0 +1,107 @@ +From b8b8f0a0ba7a6bfd266a1498ee8cfb72e864a453 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 18 Sep 2009 13:34:02 -0400 +Subject: [PATCH 07/85] ext4: Avoid group preallocation for closed files + +(cherry picked from commit 50797481a7bdee548589506d7d7b48b08bc14dcd) + +Currently the group preallocation code tries to find a large (512) +free block from which to do per-cpu group allocation for small files. +The problem with this scheme is that it leaves the filesystem horribly +fragmented. In the worst case, if the filesystem is unmounted and +remounted (after a system shutdown, for example) we forget the fact +that we were using a particular (now-partially filled) 512 block +extent. So the next time we try to allocate space for a small file, +we will find *another* completely free 512 block chunk to allocate +small files. 
Given that there are 32,768 blocks in a block group, +after 64 iterations of "mount, write one 4k file in a directory, +unmount", the block group will have 64 files, each separated by 511 +blocks, and the block group will no longer have any free 512 +completely free chunks of blocks for group preallocation space. + +So if we try to allocate blocks for a file that has been closed, such +that we know the final size of the file, and the filesystem is not +busy, avoid using group preallocation. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 30 +++++++++++++++++++++++++++++- + fs/ext4/mballoc.c | 10 +++++++++- + 2 files changed, 38 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -952,6 +952,7 @@ struct ext4_sb_info { + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; ++ atomic_t s_lock_busy; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; +@@ -1593,15 +1594,42 @@ struct ext4_group_info { + #define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + ++#define EXT4_MAX_CONTENTION 8 ++#define EXT4_CONTENTION_THRESHOLD 2 ++ + static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) + { + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); + } + ++/* ++ * Returns true if the filesystem is busy enough that attempts to ++ * access the block group locks has run into contention. ++ */ ++static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) ++{ ++ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); ++} ++ + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) + { +- spin_lock(ext4_group_lock_ptr(sb, group)); ++ spinlock_t *lock = ext4_group_lock_ptr(sb, group); ++ if (spin_trylock(lock)) ++ /* ++ * We're able to grab the lock right away, so drop the ++ * lock contention counter. 
++ */ ++ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); ++ else { ++ /* ++ * The lock is busy, so bump the contention counter, ++ * and then wait on the spin lock. ++ */ ++ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, ++ EXT4_MAX_CONTENTION); ++ spin_lock(lock); ++ } + } + + static inline void ext4_unlock_group(struct super_block *sb, +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4154,9 +4154,17 @@ static void ext4_mb_group_or_file(struct + return; + + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; +- isize = i_size_read(ac->ac_inode) >> bsbits; ++ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) ++ >> bsbits; + size = max(size, isize); + ++ if ((size == isize) && ++ !ext4_fs_is_busy(sbi) && ++ (atomic_read(&ac->ac_inode->i_writecount) == 0)) { ++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; ++ return; ++ } ++ + /* don't use group allocation for large files */ + if (size >= sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; diff --git a/queue-2.6.31/0008-jbd2-Annotate-transaction-start-also-for-jbd2_journa.patch b/queue-2.6.31/0008-jbd2-Annotate-transaction-start-also-for-jbd2_journa.patch new file mode 100644 index 00000000000..4b4304635af --- /dev/null +++ b/queue-2.6.31/0008-jbd2-Annotate-transaction-start-also-for-jbd2_journa.patch @@ -0,0 +1,47 @@ +From ddd54044cba62d714d41d11c1bf69738113ba7b7 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 17 Aug 2009 21:23:17 -0400 +Subject: [PATCH 08/85] jbd2: Annotate transaction start also for jbd2_journal_restart() + +(cherry picked from commit 9599b0e597d810be9b8f759ea6e9619c4f983c5e) + +lockdep annotation for a transaction start has been at the end of +jbd2_journal_start(). But a transaction is also started from +jbd2_journal_restart(). Move the lockdep annotation to start_this_handle() +which covers both cases. 
+ +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/transaction.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/jbd2/transaction.c ++++ b/fs/jbd2/transaction.c +@@ -238,6 +238,8 @@ repeat_locked: + __jbd2_log_space_left(journal)); + spin_unlock(&transaction->t_handle_lock); + spin_unlock(&journal->j_state_lock); ++ ++ lock_map_acquire(&handle->h_lockdep_map); + out: + if (unlikely(new_transaction)) /* It's usually NULL */ + kfree(new_transaction); +@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t * + handle = ERR_PTR(err); + goto out; + } +- +- lock_map_acquire(&handle->h_lockdep_map); + out: + return handle; + } +@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handl + __jbd2_log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + ++ lock_map_release(&handle->h_lockdep_map); + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle); + return ret; diff --git a/queue-2.6.31/0009-ext4-Fix-possible-deadlock-between-ext4_truncate-and.patch b/queue-2.6.31/0009-ext4-Fix-possible-deadlock-between-ext4_truncate-and.patch new file mode 100644 index 00000000000..22d84127b8d --- /dev/null +++ b/queue-2.6.31/0009-ext4-Fix-possible-deadlock-between-ext4_truncate-and.patch @@ -0,0 +1,136 @@ +From f9c3d81fbd609a2e31213ef4ee0c4ddf40a3e9da Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 17 Aug 2009 22:17:20 -0400 +Subject: [PATCH 09/85] ext4: Fix possible deadlock between ext4_truncate() and ext4_get_blocks() + +During truncate we are sometimes forced to start a new transaction as +the amount of blocks to be journaled is both quite large and hard to +predict. So far we restarted a transaction while holding i_data_sem +and that violates lock ordering because i_data_sem ranks below a +transaction start (and it can lead to a real deadlock with +ext4_get_blocks() mapping blocks in some page while having a +transaction open). 
+ +(cherry picked from commit 487caeef9fc08c0565e082c40a8aaf58dad92bbb) + +We fix the problem by dropping the i_data_sem before restarting the +transaction and acquire it afterwards. It's slightly subtle that this +works: + +1) By the time ext4_truncate() is called, all the page cache for the +truncated part of the file is dropped so get_block() should not be +called on it (we only have to invalidate extent cache after we +reacquire i_data_sem because some extent from not-truncated part could +extend also into the part we are going to truncate). + +2) Writes, migrate or defrag hold i_mutex so they are stopped for all +the time of the truncate. + +This bug has been found and analyzed by Theodore Tso . + +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 1 + + fs/ext4/extents.c | 15 ++++++++++++--- + fs/ext4/inode.c | 23 +++++++++++++++++++---- + 3 files changed, 32 insertions(+), 7 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1370,6 +1370,7 @@ extern int ext4_change_inode_journal_fla + extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); + extern int ext4_can_truncate(struct inode *inode); + extern void ext4_truncate(struct inode *); ++extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); + extern void ext4_set_inode_flags(struct inode *); + extern void ext4_get_inode_flags(struct ext4_inode_info *); + extern int ext4_alloc_da_blocks(struct inode *inode); +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct + ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); + } + +-static int ext4_ext_journal_restart(handle_t *handle, int needed) ++static int ext4_ext_truncate_extend_restart(handle_t *handle, ++ struct inode *inode, ++ int needed) + { + int err; + +@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(hand + err = ext4_journal_extend(handle, needed); + if (err 
<= 0) + return err; +- return ext4_journal_restart(handle, needed); ++ err = ext4_truncate_restart_trans(handle, inode, needed); ++ /* ++ * We have dropped i_data_sem so someone might have cached again ++ * an extent we are going to truncate. ++ */ ++ ext4_ext_invalidate_cache(inode); ++ ++ return err; + } + + /* +@@ -2138,7 +2147,7 @@ ext4_ext_rm_leaf(handle_t *handle, struc + } + credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); + +- err = ext4_ext_journal_restart(handle, credits); ++ err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) + goto out; + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -192,11 +192,24 @@ static int try_to_extend_transaction(han + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) ++ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, ++ int nblocks) + { ++ int ret; ++ ++ /* ++ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this ++ * moment, get_block can be called only for blocks inside i_size since ++ * page cache has been already dropped and writes are blocked by ++ * i_mutex. So we can safely drop the i_data_sem here. 
++ */ + BUG_ON(EXT4_JOURNAL(inode) == NULL); + jbd_debug(2, "restarting handle %p\n", handle); +- return ext4_journal_restart(handle, blocks_for_truncate(inode)); ++ up_write(&EXT4_I(inode)->i_data_sem); ++ ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); ++ down_write(&EXT4_I(inode)->i_data_sem); ++ ++ return ret; + } + + /* +@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t * + ext4_handle_dirty_metadata(handle, inode, bh); + } + ext4_mark_inode_dirty(handle, inode); +- ext4_journal_test_restart(handle, inode); ++ ext4_truncate_restart_trans(handle, inode, ++ blocks_for_truncate(inode)); + if (bh) { + BUFFER_TRACE(bh, "retaking write access"); + ext4_journal_get_write_access(handle, bh); +@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t + return; + if (try_to_extend_transaction(handle, inode)) { + ext4_mark_inode_dirty(handle, inode); +- ext4_journal_test_restart(handle, inode); ++ ext4_truncate_restart_trans(handle, inode, ++ blocks_for_truncate(inode)); + } + + ext4_free_blocks(handle, inode, nr, 1, 1); diff --git a/queue-2.6.31/0010-ext4-reject-too-large-filesystems-on-32-bit-kernels.patch b/queue-2.6.31/0010-ext4-reject-too-large-filesystems-on-32-bit-kernels.patch new file mode 100644 index 00000000000..dcc144cab42 --- /dev/null +++ b/queue-2.6.31/0010-ext4-reject-too-large-filesystems-on-32-bit-kernels.patch @@ -0,0 +1,49 @@ +From a81c41ac6e920bdb87b51f20afde929cb08c00a1 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Mon, 17 Aug 2009 23:48:51 -0400 +Subject: [PATCH 10/85] ext4: reject too-large filesystems on 32-bit kernels + +(cherry picked from commit bf43d84b185e2ff54598f8c58a5a8e63148b6e90) + +ext4 will happily mount a > 16T filesystem on a 32-bit box, but +this is not safe; writes to the block device will wrap past 16T +and the page cache can't index past 16T (2^32 index * 4k pages). + +Adding another test to the existing "too many sectors" test +should do the trick. 
+ +Add a comment, a relevant return value, and fix the reference +to the CONFIG_LBD(AF) option as well. + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2550,12 +2550,19 @@ static int ext4_fill_super(struct super_ + goto failed_mount; + } + +- if (ext4_blocks_count(es) > +- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { ++ /* ++ * Test whether we have more sectors than will fit in sector_t, ++ * and whether the max offset is addressable by the page cache. ++ */ ++ if ((ext4_blocks_count(es) > ++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || ++ (ext4_blocks_count(es) > ++ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { + ext4_msg(sb, KERN_ERR, "filesystem" +- " too large to mount safely"); ++ " too large to mount safely on this system"); + if (sizeof(sector_t) < 8) + ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); ++ ret = -EFBIG; + goto failed_mount; + } + diff --git a/queue-2.6.31/0011-ext4-Add-feature-set-check-helper-for-mount-remount-.patch b/queue-2.6.31/0011-ext4-Add-feature-set-check-helper-for-mount-remount-.patch new file mode 100644 index 00000000000..827bbdf24e6 --- /dev/null +++ b/queue-2.6.31/0011-ext4-Add-feature-set-check-helper-for-mount-remount-.patch @@ -0,0 +1,161 @@ +From 40a86fd67acf2d0019332f31c1086ba5f22478cc Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Tue, 18 Aug 2009 00:20:23 -0400 +Subject: [PATCH 11/85] ext4: Add feature set check helper for mount & remount paths + +(cherry picked from commit a13fb1a4533f26c1e2b0204d5283b696689645af) + +A user reported that although his root ext4 filesystem was mounting +fine, other filesystems would not mount, with the: + +"Filesystem with huge files cannot be mounted RDWR without CONFIG_LBDAF" + +error on his 32-bit box built without CONFIG_LBDAF. 
This is because +the test at mount time for this situation was not being re-checked +on remount, and the normal boot process makes an ro->rw transition, +so this was being missed. + +Refactor to make a common helper function to test the filesystem +features against the type of mount request (RO vs. RW) so that we +stay consistent. + +Addresses Red-Hat-Bugzilla: #517650 + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +--- + fs/ext4/super.c | 91 ++++++++++++++++++++++++++++++-------------------------- + 1 file changed, 49 insertions(+), 42 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2254,6 +2254,49 @@ static struct kobj_type ext4_ktype = { + .release = ext4_sb_release, + }; + ++/* ++ * Check whether this filesystem can be mounted based on ++ * the features present and the RDONLY/RDWR mount requested. ++ * Returns 1 if this filesystem can be mounted as requested, ++ * 0 if it cannot be. ++ */ ++static int ext4_feature_set_ok(struct super_block *sb, int readonly) ++{ ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { ++ ext4_msg(sb, KERN_ERR, ++ "Couldn't mount because of " ++ "unsupported optional features (%x)", ++ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & ++ ~EXT4_FEATURE_INCOMPAT_SUPP)); ++ return 0; ++ } ++ ++ if (readonly) ++ return 1; ++ ++ /* Check that feature set is OK for a read-write mount */ ++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { ++ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " ++ "unsupported optional features (%x)", ++ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & ++ ~EXT4_FEATURE_RO_COMPAT_SUPP)); ++ return 0; ++ } ++ /* ++ * Large file size enabled file system can only be mounted ++ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF ++ */ ++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { ++ if (sizeof(blkcnt_t) < sizeof(u64)) { ++ ext4_msg(sb, KERN_ERR, "Filesystem with huge files " ++ "cannot 
be mounted RDWR without " ++ "CONFIG_LBDAF"); ++ return 0; ++ } ++ } ++ return 1; ++} ++ + static int ext4_fill_super(struct super_block *sb, void *data, int silent) + __releases(kernel_lock) + __acquires(kernel_lock) +@@ -2275,7 +2318,6 @@ static int ext4_fill_super(struct super_ + unsigned int db_count; + unsigned int i; + int needs_recovery, has_huge_files; +- int features; + __u64 blocks_count; + int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; +@@ -2402,39 +2444,9 @@ static int ext4_fill_super(struct super_ + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ +- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); +- if (features) { +- ext4_msg(sb, KERN_ERR, +- "Couldn't mount because of " +- "unsupported optional features (%x)", +- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & +- ~EXT4_FEATURE_INCOMPAT_SUPP)); ++ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) + goto failed_mount; +- } +- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); +- if (!(sb->s_flags & MS_RDONLY) && features) { +- ext4_msg(sb, KERN_ERR, +- "Couldn't mount RDWR because of " +- "unsupported optional features (%x)", +- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & +- ~EXT4_FEATURE_RO_COMPAT_SUPP)); +- goto failed_mount; +- } +- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, +- EXT4_FEATURE_RO_COMPAT_HUGE_FILE); +- if (has_huge_files) { +- /* +- * Large file size enabled file system can only be +- * mount if kernel is build with CONFIG_LBDAF +- */ +- if (sizeof(root->i_blocks) < sizeof(u64) && +- !(sb->s_flags & MS_RDONLY)) { +- ext4_msg(sb, KERN_ERR, "Filesystem with huge " +- "files cannot be mounted read-write " +- "without CONFIG_LBDAF"); +- goto failed_mount; +- } +- } ++ + blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); + + if (blocksize < EXT4_MIN_BLOCK_SIZE || +@@ -2470,6 +2482,8 @@ static 
int ext4_fill_super(struct super_ + } + } + ++ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, ++ EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); +@@ -3485,18 +3499,11 @@ static int ext4_remount(struct super_blo + if (sbi->s_journal) + ext4_mark_recovery_complete(sb, es); + } else { +- int ret; +- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, +- ~EXT4_FEATURE_RO_COMPAT_SUPP))) { +- ext4_msg(sb, KERN_WARNING, "couldn't " +- "remount RDWR because of unsupported " +- "optional features (%x)", +- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & +- ~EXT4_FEATURE_RO_COMPAT_SUPP)); ++ /* Make sure we can mount this feature set readwrite */ ++ if (!ext4_feature_set_ok(sb, 0)) { + err = -EROFS; + goto restore_opts; + } +- + /* + * Make sure the group descriptor checksums + * are sane. If they aren't, refuse to remount r/w. diff --git a/queue-2.6.31/0012-ext4-Add-missing-unlock_new_inode-call-in-extent-mig.patch b/queue-2.6.31/0012-ext4-Add-missing-unlock_new_inode-call-in-extent-mig.patch new file mode 100644 index 00000000000..6bb4ab9affc --- /dev/null +++ b/queue-2.6.31/0012-ext4-Add-missing-unlock_new_inode-call-in-extent-mig.patch @@ -0,0 +1,50 @@ +From 10974fedd2891664ca8212ff7441e258cdfb0e6c Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Tue, 25 Aug 2009 22:36:05 -0400 +Subject: [PATCH 12/85] ext4: Add missing unlock_new_inode() call in extent migration code + +(cherry picked from commit a8526e84ac758ac6da45cf273aa1538a6a7aa3de) + +We need to unlock the new inode before iput. This patch fixes the +following warning when calling chattr +e to migrate a file to use +extents. It also fixes problems in when e4defrag attempts to +defragment an inode. + +[ 470.400044] ------------[ cut here ]------------ +[ 470.400065] WARNING: at fs/inode.c:1210 generic_delete_inode+0x65/0x16a() +[ 470.400072] Hardware name: N/A +..... +... 
+[ 470.400353] Pid: 4451, comm: chattr Not tainted 2.6.31-rc7-red-debug #4 +[ 470.400359] Call Trace: +[ 470.400372] [] warn_slowpath_common+0x77/0x8f +[ 470.400385] [] warn_slowpath_null+0xf/0x11 +[ 470.400395] [] generic_delete_inode+0x65/0x16a +[ 470.400405] [] generic_drop_inode+0x17/0x1bd +[ 470.400413] [] iput+0x61/0x65 +[ 470.400455] [] ext4_ext_migrate+0x5eb/0x66a [ext4] +[ 470.400492] [] ext4_ioctl+0x340/0x756 [ext4] +[ 470.400507] [] vfs_ioctl+0x1d/0x82 +[ 470.400517] [] do_vfs_ioctl+0x483/0x4c9 +[ 470.400527] [] ? trace_hardirqs_on+0xd/0xf +[ 470.400537] [] sys_ioctl+0x51/0x74 +[ 470.400549] [] system_call_fastpath+0x16/0x1b +[ 470.400557] ---[ end trace ab85723542352dac ]--- + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +--- + fs/ext4/migrate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -618,7 +618,7 @@ err_out: + tmp_inode->i_nlink = 0; + + ext4_journal_stop(handle); +- ++ unlock_new_inode(tmp_inode); + iput(tmp_inode); + + return retval; diff --git a/queue-2.6.31/0013-ext4-Allow-rename-to-create-more-than-EXT4_LINK_MAX-.patch b/queue-2.6.31/0013-ext4-Allow-rename-to-create-more-than-EXT4_LINK_MAX-.patch new file mode 100644 index 00000000000..17efb6074b1 --- /dev/null +++ b/queue-2.6.31/0013-ext4-Allow-rename-to-create-more-than-EXT4_LINK_MAX-.patch @@ -0,0 +1,27 @@ +From 596184d7832b868c8ebce9de87058f4ee101d887 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Fri, 28 Aug 2009 21:43:15 -0400 +Subject: [PATCH 13/85] ext4: Allow rename to create more than EXT4_LINK_MAX subdirectories + +(cherry picked from commit 2c94eb86c66e1eaaa1e7d8a2120f4fad5e7e7736) + +Use EXT4_DIR_LINK_MAX so that rename() can move a directory into a new +parent directory without running into the EXT4_LINK_MAX limit. 
+ +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +--- + fs/ext4/namei.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -2413,7 +2413,7 @@ static int ext4_rename(struct inode *old + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir != old_dir && +- new_dir->i_nlink >= EXT4_LINK_MAX) ++ EXT4_DIR_LINK_MAX(new_dir)) + goto end_rename; + } + if (!new_bh) { diff --git a/queue-2.6.31/0014-ext4-Limit-number-of-links-that-can-be-created-by-ex.patch b/queue-2.6.31/0014-ext4-Limit-number-of-links-that-can-be-created-by-ex.patch new file mode 100644 index 00000000000..06b80577106 --- /dev/null +++ b/queue-2.6.31/0014-ext4-Limit-number-of-links-that-can-be-created-by-ex.patch @@ -0,0 +1,27 @@ +From fc318a8ca9eb7946de7212790a9eb080f681eeaa Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 29 Aug 2009 21:08:08 -0400 +Subject: [PATCH 14/85] ext4: Limit number of links that can be created by ext4_link() + +(cherry picked from commit b05ab1dc3795e6f997fb0d34f38fce5012533c3e) + +In ext4_link we need to check using EXT4_LINK_MAX, and not +EXT4_DIR_LINK_MAX(), since ext4_link() is creating hard links of +regular files, and not directories. 
+ +Signed-off-by: "Theodore Ts'o" +--- + fs/ext4/namei.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -2310,7 +2310,7 @@ static int ext4_link(struct dentry *old_ + struct inode *inode = old_dentry->d_inode; + int err, retries = 0; + +- if (EXT4_DIR_LINK_MAX(inode)) ++ if (inode->i_nlink >= EXT4_LINK_MAX) + return -EMLINK; + + /* diff --git a/queue-2.6.31/0015-ext4-Restore-wbc-range_start-in-ext4_da_writepages.patch b/queue-2.6.31/0015-ext4-Restore-wbc-range_start-in-ext4_da_writepages.patch new file mode 100644 index 00000000000..5dd37654a39 --- /dev/null +++ b/queue-2.6.31/0015-ext4-Restore-wbc-range_start-in-ext4_da_writepages.patch @@ -0,0 +1,39 @@ +From 2b208d0bb3de20ea09cdcfe51e44b242739b075d Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 31 Aug 2009 17:00:59 -0400 +Subject: [PATCH 15/85] ext4: Restore wbc->range_start in ext4_da_writepages() + +(cherry picked from commit de89de6e0cf4b1eb13f27137cf2aa40d287aabdf) + +To solve a lock inversion problem, we implement part of the +range_cyclic algorithm in ext4_da_writepages(). (See commit 2acf2c26 +for more details.) + +As part of that change wbc->range_start was modified by ext4's +writepages function, which causes its callers to get confused since +they aren't expecting the filesystem to modify it. The simplest fix +is to save and restore wbc->range_start in ext4_da_writepages. 
+ +Signed-off-by: "Theodore Ts'o" +--- + fs/ext4/inode.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2750,6 +2750,7 @@ static int ext4_da_writepages(struct add + long pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; + int needed_blocks, ret = 0, nr_to_writebump = 0; ++ loff_t range_start = wbc->range_start; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + + trace_ext4_da_writepages(inode, wbc); +@@ -2918,6 +2919,7 @@ out_writepages: + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; ++ wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + return ret; + } diff --git a/queue-2.6.31/0016-ext4-fix-cache-flush-in-ext4_sync_file.patch b/queue-2.6.31/0016-ext4-fix-cache-flush-in-ext4_sync_file.patch new file mode 100644 index 00000000000..47df45412c8 --- /dev/null +++ b/queue-2.6.31/0016-ext4-fix-cache-flush-in-ext4_sync_file.patch @@ -0,0 +1,35 @@ +From 889c3508467677a534209736351f028a5b925cc2 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Sat, 5 Sep 2009 21:42:42 -0400 +Subject: [PATCH 16/85] ext4: fix cache flush in ext4_sync_file + +(cherry picked from commit 5f3481e9a80c240f169b36ea886e2325b9aeb745) + +We need to flush the write cache unconditionally in ->fsync, otherwise +writes into already allocated blocks can get lost. Writes into fully +allocated files are very common when using disk images for +virtualization, and without this fix can easily lose data after +an fdatasync, which is the typical implementation for a cache flush on +the virtual drive. 
+ +Signed-off-by: Christoph Hellwig +Acked-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +--- + fs/ext4/fsync.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -92,9 +92,9 @@ int ext4_sync_file(struct file *file, st + .nr_to_write = 0, /* sys_fsync did this */ + }; + ret = sync_inode(inode, &wbc); +- if (journal && (journal->j_flags & JBD2_BARRIER)) +- blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + } + out: ++ if (journal && (journal->j_flags & JBD2_BARRIER)) ++ blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + return ret; + } diff --git a/queue-2.6.31/0017-ext4-Fix-wrong-comparisons-in-mext_check_arguments.patch b/queue-2.6.31/0017-ext4-Fix-wrong-comparisons-in-mext_check_arguments.patch new file mode 100644 index 00000000000..337197c2c14 --- /dev/null +++ b/queue-2.6.31/0017-ext4-Fix-wrong-comparisons-in-mext_check_arguments.patch @@ -0,0 +1,101 @@ +From 5b927cfba5b756c8f401e30fe2890b22a0b1fbb8 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Wed, 16 Sep 2009 14:28:22 -0400 +Subject: [PATCH 17/85] ext4: Fix wrong comparisons in mext_check_arguments() + +(cherry picked from commit 70d5d3dcea47c16058d2b093c29e07fdf61b56ad) + +The mext_check_arguments() function in move_extents.c has wrong +comparisons. orig_start which is passed from user-space is block +unit, but i_size of inode is byte unit, therefore the checks do not +work fine. This mis-check leads to the overflow of 'len' and then +hits BUG_ON() in ext4_move_extents(). The patch fixes this issue. 
+ +Signed-off-by: Akira Fujita +Reviewed-by: Greg Freemyer +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++------------------- + 1 file changed, 27 insertions(+), 19 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -898,6 +898,10 @@ mext_check_arguments(struct inode *orig_ + struct inode *donor_inode, __u64 orig_start, + __u64 donor_start, __u64 *len, __u64 moved_len) + { ++ ext4_lblk_t orig_blocks, donor_blocks; ++ unsigned int blkbits = orig_inode->i_blkbits; ++ unsigned int blocksize = 1 << blkbits; ++ + /* Regular file check */ + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { + ext4_debug("ext4 move extent: The argument files should be " +@@ -972,43 +976,47 @@ mext_check_arguments(struct inode *orig_ + } + + if (orig_inode->i_size > donor_inode->i_size) { +- if (orig_start >= donor_inode->i_size) { ++ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; ++ /* TODO: eliminate this artificial restriction */ ++ if (orig_start >= donor_blocks) { + ext4_debug("ext4 move extent: orig start offset " +- "[%llu] should be less than donor file size " +- "[%lld] [ino:orig %lu, donor_inode %lu]\n", +- orig_start, donor_inode->i_size, ++ "[%llu] should be less than donor file blocks " ++ "[%u] [ino:orig %lu, donor %lu]\n", ++ orig_start, donor_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + +- if (orig_start + *len > donor_inode->i_size) { ++ /* TODO: eliminate this artificial restriction */ ++ if (orig_start + *len > donor_blocks) { + ext4_debug("ext4 move extent: End offset [%llu] should " +- "be less than donor file size [%lld]." +- "So adjust length from %llu to %lld " ++ "be less than donor file blocks [%u]." 
++ "So adjust length from %llu to %llu " + "[ino:orig %lu, donor %lu]\n", +- orig_start + *len, donor_inode->i_size, +- *len, donor_inode->i_size - orig_start, ++ orig_start + *len, donor_blocks, ++ *len, donor_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); +- *len = donor_inode->i_size - orig_start; ++ *len = donor_blocks - orig_start; + } + } else { +- if (orig_start >= orig_inode->i_size) { ++ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; ++ if (orig_start >= orig_blocks) { + ext4_debug("ext4 move extent: start offset [%llu] " +- "should be less than original file size " +- "[%lld] [inode:orig %lu, donor %lu]\n", +- orig_start, orig_inode->i_size, ++ "should be less than original file blocks " ++ "[%u] [ino:orig %lu, donor %lu]\n", ++ orig_start, orig_blocks, + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } + +- if (orig_start + *len > orig_inode->i_size) { ++ if (orig_start + *len > orig_blocks) { + ext4_debug("ext4 move extent: Adjust length " +- "from %llu to %lld. Because it should be " +- "less than original file size " ++ "from %llu to %llu. 
Because it should be " ++ "less than original file blocks " + "[ino:orig %lu, donor %lu]\n", +- *len, orig_inode->i_size - orig_start, ++ *len, orig_blocks - orig_start, + orig_inode->i_ino, donor_inode->i_ino); +- *len = orig_inode->i_size - orig_start; ++ *len = orig_blocks - orig_start; + } + } + diff --git a/queue-2.6.31/0018-ext4-Remove-unneeded-BUG_ON-in-ext4_move_extents.patch b/queue-2.6.31/0018-ext4-Remove-unneeded-BUG_ON-in-ext4_move_extents.patch new file mode 100644 index 00000000000..7a8389761a0 --- /dev/null +++ b/queue-2.6.31/0018-ext4-Remove-unneeded-BUG_ON-in-ext4_move_extents.patch @@ -0,0 +1,32 @@ +From 453ad06301204bd60bd132733506972f02f986a2 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sat, 5 Sep 2009 22:11:55 -0400 +Subject: [PATCH 18/85] ext4: Remove unneeded BUG_ON() in ext4_move_extents() + +(cherry picked from commit daea696dbac0e33af3cfe304efbfb8d74e0effe6) + +The ext4_move_extents() function checks with BUG_ON() whether the +exchanged blocks count accords with request blocks count. But, if the +target range (orig_start + len) includes sparse block(s), 'moved_len' +(exchanged blocks count) does not agree with 'len' (request blocks +count), since sparse block is not counted in 'moved_len'. This causes +us to hit the BUG_ON(), even though the function succeeded. 
+ +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 3 --- + 1 file changed, 3 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -1322,8 +1322,5 @@ out2: + if (ret) + return ret; + +- /* All of the specified blocks must be exchanged in succeed */ +- BUG_ON(*moved_len != len); +- + return 0; + } diff --git a/queue-2.6.31/0019-ext4-Return-exchanged-blocks-count-to-user-space-in-.patch b/queue-2.6.31/0019-ext4-Return-exchanged-blocks-count-to-user-space-in-.patch new file mode 100644 index 00000000000..f4c9464da4c --- /dev/null +++ b/queue-2.6.31/0019-ext4-Return-exchanged-blocks-count-to-user-space-in-.patch @@ -0,0 +1,33 @@ +From ed4c4355f2cdf69d08662aedaeac636ff879f631 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sat, 5 Sep 2009 22:46:29 -0400 +Subject: [PATCH 19/85] ext4: Return exchanged blocks count to user space in failure + +(cherry picked from commit 8d6669133d8cdbb7cbe0e1f0f3744e7802a84afe) + +Return exchanged blocks count (moved_len) to user space, +if ext4_move_extents() failed on the way. 
+ +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ioctl.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -243,10 +243,9 @@ setversion_out: + me.donor_start, me.len, &me.moved_len); + fput(donor_filp); + +- if (!err) +- if (copy_to_user((struct move_extent *)arg, +- &me, sizeof(me))) +- return -EFAULT; ++ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) ++ return -EFAULT; ++ + return err; + } + diff --git a/queue-2.6.31/0020-ext4-Take-page-lock-before-looking-at-attached-buffe.patch b/queue-2.6.31/0020-ext4-Take-page-lock-before-looking-at-attached-buffe.patch new file mode 100644 index 00000000000..70492920ba2 --- /dev/null +++ b/queue-2.6.31/0020-ext4-Take-page-lock-before-looking-at-attached-buffe.patch @@ -0,0 +1,43 @@ +From bc476d1a48a5a417aed2d3ad666346872d8bdee7 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Wed, 9 Sep 2009 22:36:03 -0400 +Subject: [PATCH 20/85] ext4: Take page lock before looking at attached buffer_heads flags + +(cherry picked from commit a827eaffff07c7d58a4cb32158cbeb4849f4e33a) + +In order to check whether the buffer_heads are mapped we need to hold +page lock. Otherwise a reclaim can cleanup the attached buffer_heads. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5298,12 +5298,21 @@ int ext4_page_mkwrite(struct vm_area_str + else + len = PAGE_CACHE_SIZE; + ++ lock_page(page); ++ /* ++ * return if we have all the buffers mapped. 
This avoid ++ * the need to call write_begin/write_end which does a ++ * journal_start/journal_stop which can block and take ++ * long time ++ */ + if (page_has_buffers(page)) { +- /* return if we have all the buffers mapped */ + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, +- ext4_bh_unmapped)) ++ ext4_bh_unmapped)) { ++ unlock_page(page); + goto out_unlock; ++ } + } ++ unlock_page(page); + /* + * OK, we need to fill the hole... Do write_begin write_end + * to do block allocation/reservation.We are not holding diff --git a/queue-2.6.31/0021-ext4-print-more-sysadmin-friendly-message-in-check_b.patch b/queue-2.6.31/0021-ext4-print-more-sysadmin-friendly-message-in-check_b.patch new file mode 100644 index 00000000000..f1faf3c4e83 --- /dev/null +++ b/queue-2.6.31/0021-ext4-print-more-sysadmin-friendly-message-in-check_b.patch @@ -0,0 +1,64 @@ +From 55f3892371dc7439f9d18df3c0ed0dd96a31263c Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Tue, 8 Sep 2009 08:21:26 -0400 +Subject: [PATCH 21/85] ext4: print more sysadmin-friendly message in check_block_validity() + +(cherry picked from commit 80e42468d65475e92651e62175bb7807773321d0) + +Drop the WARN_ON(1), as the stack trace is not appropriate, since it is +triggered by file system corruption, and it misleads users into +thinking there is a kernel bug. In addition, change the message +displayed by ext4_error() to make it clear that this is a file system +corruption problem. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1122,16 +1122,15 @@ static void ext4_da_update_reserve_space + ext4_discard_preallocations(inode); + } + +-static int check_block_validity(struct inode *inode, sector_t logical, +- sector_t phys, int len) ++static int check_block_validity(struct inode *inode, const char *msg, ++ sector_t logical, sector_t phys, int len) + { + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { +- ext4_error(inode->i_sb, "check_block_validity", ++ ext4_error(inode->i_sb, msg, + "inode #%lu logical block %llu mapped to %llu " + "(size %d)", inode->i_ino, + (unsigned long long) logical, + (unsigned long long) phys, len); +- WARN_ON(1); + return -EIO; + } + return 0; +@@ -1183,8 +1182,8 @@ int ext4_get_blocks(handle_t *handle, st + up_read((&EXT4_I(inode)->i_data_sem)); + + if (retval > 0 && buffer_mapped(bh)) { +- int ret = check_block_validity(inode, block, +- bh->b_blocknr, retval); ++ int ret = check_block_validity(inode, "file system corruption", ++ block, bh->b_blocknr, retval); + if (ret != 0) + return ret; + } +@@ -1265,8 +1264,9 @@ int ext4_get_blocks(handle_t *handle, st + + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { +- int ret = check_block_validity(inode, block, +- bh->b_blocknr, retval); ++ int ret = check_block_validity(inode, "file system " ++ "corruption after allocation", ++ block, bh->b_blocknr, retval); + if (ret != 0) + return ret; + } diff --git a/queue-2.6.31/0022-ext4-Use-bforget-in-no-journal-mode-for-ext4_journal.patch b/queue-2.6.31/0022-ext4-Use-bforget-in-no-journal-mode-for-ext4_journal.patch new file mode 100644 index 00000000000..eda778c0dad --- /dev/null +++ b/queue-2.6.31/0022-ext4-Use-bforget-in-no-journal-mode-for-ext4_journal.patch @@ -0,0 +1,45 @@ +From 
8263746e63ecdc586b2d0c969d905dd78645fa6f Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Wed, 9 Sep 2009 21:32:41 -0400 +Subject: [PATCH 22/85] ext4: Use bforget() in no journal mode for ext4_journal_{forget,revoke}() + +(cherry picked from commit c7acb4c16646943180bd221c167a077e0a084f9c) + +When ext4 is using a journal, a metadata block which is deallocated +must be passed into the journal layer so it can be dropped from the +current transaction and/or revoked. This is done by calling the +functions ext4_journal_forget() and ext4_journal_revoke(), which call +jbd2_journal_forget(), and jbd2_journal_revoke(), respectively. + +Since the jbd2_journal_forget() and jbd2_journal_revoke() call +bforget(), if ext4 is not using a journal, ext4_journal_forget() and +ext4_journal_revoke() must call bforget() to avoid a dirty metadata +block overwriting a block after it has been reallocated and reused for +another inode's data block. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4_jbd2.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ext4_jbd2.c ++++ b/fs/ext4/ext4_jbd2.c +@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *wh + handle, err); + } + else +- brelse(bh); ++ bforget(bh); + return err; + } + +@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *wh + handle, err); + } + else +- brelse(bh); ++ bforget(bh); + return err; + } + diff --git a/queue-2.6.31/0023-ext4-Assure-that-metadata-blocks-are-written-during-.patch b/queue-2.6.31/0023-ext4-Assure-that-metadata-blocks-are-written-during-.patch new file mode 100644 index 00000000000..c5f44b76e4d --- /dev/null +++ b/queue-2.6.31/0023-ext4-Assure-that-metadata-blocks-are-written-during-.patch @@ -0,0 +1,67 @@ +From d20eaa373ef416b7f364dd67fb1412ce22a53dcf Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 12 Sep 2009 13:41:55 -0400 +Subject: [PATCH 23/85] ext4: Assure that metadata blocks are written during fsync in no journal mode + 
+(cherry picked from commit fe188c0e084bdf3038dc0ac963c21d764f53f7da) + +When there is no journal present, we must attach buffer heads +associated with extent tree and indirect blocks to the inode's +mapping->private_list via mark_buffer_dirty_inode() so that +ext4_sync_file() --- which is called to service fsync() and +fdatasync() system calls --- can write out the inode's metadata blocks +by calling sync_mapping_buffers(). + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4_jbd2.c | 5 ++++- + fs/ext4/fsync.c | 9 +++++++-- + 2 files changed, 11 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4_jbd2.c ++++ b/fs/ext4/ext4_jbd2.c +@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const c + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } else { +- mark_buffer_dirty(bh); ++ if (inode && bh) ++ mark_buffer_dirty_inode(bh, inode); ++ else ++ mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, st + { + struct inode *inode = dentry->d_inode; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; +- int ret = 0; ++ int err, ret = 0; + + J_ASSERT(ext4_journal_current_handle() == NULL); + +@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, st + goto out; + } + ++ if (!journal) ++ ret = sync_mapping_buffers(inode->i_mapping); ++ + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + +@@ -91,7 +94,9 @@ int ext4_sync_file(struct file *file, st + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, /* sys_fsync did this */ + }; +- ret = sync_inode(inode, &wbc); ++ err = sync_inode(inode, &wbc); ++ if (ret == 0) ++ ret = err; + } + out: + if (journal && (journal->j_flags & JBD2_BARRIER)) diff --git a/queue-2.6.31/0024-ext4-Make-non-journal-fsync-work-properly.patch 
b/queue-2.6.31/0024-ext4-Make-non-journal-fsync-work-properly.patch new file mode 100644 index 00000000000..46500573f1f --- /dev/null +++ b/queue-2.6.31/0024-ext4-Make-non-journal-fsync-work-properly.patch @@ -0,0 +1,117 @@ +From ffdd962c92b2407658e6784844d9c0e48eb175e4 Mon Sep 17 00:00:00 2001 +From: Frank Mayhar +Date: Wed, 9 Sep 2009 22:33:47 -0400 +Subject: [PATCH 24/85] ext4: Make non-journal fsync work properly + +(cherry picked from commit 91ac6f43317c0bf99969665f98016548011dfa38) + +Teach ext4_write_inode() and ext4_do_update_inode() about non-journal +mode: If we're not using a journal, ext4_write_inode() now calls +ext4_do_update_inode() (after getting the iloc via ext4_get_inode_loc()) +with a new "do_sync" parameter. If that parameter is nonzero _and_ we're +not using a journal, ext4_do_update_inode() calls sync_dirty_buffer() +instead of ext4_handle_dirty_metadata(). + +This problem was found in power-fail testing, checking the amount of +loss of files and blocks after a power failure when using fsync() and +when not using fsync(). It turned out that using fsync() was actually +worse than not doing so, possibly because it increased the likelihood +that the inodes would remain unflushed and would therefore be lost at +the power failure. 
+ +Signed-off-by: Frank Mayhar +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 40 insertions(+), 14 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4550,7 +4550,8 @@ static int ext4_inode_blocks_set(handle_ + */ + static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, +- struct ext4_iloc *iloc) ++ struct ext4_iloc *iloc, ++ int do_sync) + { + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); +@@ -4652,10 +4653,22 @@ static int ext4_do_update_inode(handle_t + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + +- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); +- rc = ext4_handle_dirty_metadata(handle, inode, bh); +- if (!err) +- err = rc; ++ /* ++ * If we're not using a journal and we were called from ++ * ext4_write_inode() to sync the inode (making do_sync true), ++ * we can just use sync_dirty_buffer() directly to do our dirty ++ * work. Testing s_journal here is a bit redundant but it's ++ * worth it to avoid potential future trouble. 
++ */ ++ if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) { ++ BUFFER_TRACE(bh, "call sync_dirty_buffer"); ++ sync_dirty_buffer(bh); ++ } else { ++ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); ++ rc = ext4_handle_dirty_metadata(handle, inode, bh); ++ if (!err) ++ err = rc; ++ } + ei->i_state &= ~EXT4_STATE_NEW; + + out_brelse: +@@ -4701,19 +4714,32 @@ out_brelse: + */ + int ext4_write_inode(struct inode *inode, int wait) + { ++ int err; ++ + if (current->flags & PF_MEMALLOC) + return 0; + +- if (ext4_journal_current_handle()) { +- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); +- dump_stack(); +- return -EIO; +- } ++ if (EXT4_SB(inode->i_sb)->s_journal) { ++ if (ext4_journal_current_handle()) { ++ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); ++ dump_stack(); ++ return -EIO; ++ } + +- if (!wait) +- return 0; ++ if (!wait) ++ return 0; ++ ++ err = ext4_force_commit(inode->i_sb); ++ } else { ++ struct ext4_iloc iloc; + +- return ext4_force_commit(inode->i_sb); ++ err = ext4_get_inode_loc(inode, &iloc); ++ if (err) ++ return err; ++ err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, ++ inode, &iloc, wait); ++ } ++ return err; + } + + /* +@@ -5007,7 +5033,7 @@ int ext4_mark_iloc_dirty(handle_t *handl + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ +- err = ext4_do_update_inode(handle, inode, iloc); ++ err = ext4_do_update_inode(handle, inode, iloc, 0); + put_bh(iloc->bh); + return err; + } diff --git a/queue-2.6.31/0025-ext4-move-ext4_mb_init_group-function-earlier-in-the.patch b/queue-2.6.31/0025-ext4-move-ext4_mb_init_group-function-earlier-in-the.patch new file mode 100644 index 00000000000..4cca045b03a --- /dev/null +++ b/queue-2.6.31/0025-ext4-move-ext4_mb_init_group-function-earlier-in-the.patch @@ -0,0 +1,215 @@ +From 4aa4b8f3fa08b0748052052a7b4aadec83c28f8b Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Wed, 9 Sep 2009 23:47:46 -0400 +Subject: [PATCH 25/85] ext4: move 
ext4_mb_init_group() function earlier in the mballoc.c + +(cherry picked from commit b6a758ec3af3ec236dbfdcf6a06b84ac8f94957e) + +This moves the function around so that it can be called from +ext4_mb_load_buddy(). + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 182 +++++++++++++++++++++++++++--------------------------- + 1 file changed, 91 insertions(+), 91 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -908,6 +908,97 @@ out: + return err; + } + ++static noinline_for_stack ++int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ++{ ++ ++ int ret = 0; ++ void *bitmap; ++ int blocks_per_page; ++ int block, pnum, poff; ++ int num_grp_locked = 0; ++ struct ext4_group_info *this_grp; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct inode *inode = sbi->s_buddy_cache; ++ struct page *page = NULL, *bitmap_page = NULL; ++ ++ mb_debug("init group %lu\n", group); ++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; ++ this_grp = ext4_get_group_info(sb, group); ++ /* ++ * This ensures we don't add group ++ * to this buddy cache via resize ++ */ ++ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); ++ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { ++ /* ++ * somebody initialized the group ++ * return without doing anything ++ */ ++ ret = 0; ++ goto err; ++ } ++ /* ++ * the buddy cache inode stores the block bitmap ++ * and buddy information in consecutive blocks. ++ * So for each group we need two blocks. 
++ */ ++ block = group * 2; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ ret = ext4_mb_init_cache(page, NULL); ++ if (ret) { ++ unlock_page(page); ++ goto err; ++ } ++ unlock_page(page); ++ } ++ if (page == NULL || !PageUptodate(page)) { ++ ret = -EIO; ++ goto err; ++ } ++ mark_page_accessed(page); ++ bitmap_page = page; ++ bitmap = page_address(page) + (poff * sb->s_blocksize); ++ ++ /* init buddy cache */ ++ block++; ++ pnum = block / blocks_per_page; ++ poff = block % blocks_per_page; ++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); ++ if (page == bitmap_page) { ++ /* ++ * If both the bitmap and buddy are in ++ * the same page we don't need to force ++ * init the buddy ++ */ ++ unlock_page(page); ++ } else if (page) { ++ BUG_ON(page->mapping != inode->i_mapping); ++ ret = ext4_mb_init_cache(page, bitmap); ++ if (ret) { ++ unlock_page(page); ++ goto err; ++ } ++ unlock_page(page); ++ } ++ if (page == NULL || !PageUptodate(page)) { ++ ret = -EIO; ++ goto err; ++ } ++ mark_page_accessed(page); ++err: ++ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); ++ if (bitmap_page) ++ page_cache_release(bitmap_page); ++ if (page) ++ page_cache_release(page); ++ return ret; ++} ++ + static noinline_for_stack int + ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +@@ -1837,97 +1928,6 @@ void ext4_mb_put_buddy_cache_lock(struct + + } + +-static noinline_for_stack +-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +-{ +- +- int ret; +- void *bitmap; +- int blocks_per_page; +- int block, pnum, poff; +- int num_grp_locked = 0; +- struct ext4_group_info *this_grp; +- struct ext4_sb_info *sbi = EXT4_SB(sb); +- struct inode *inode = sbi->s_buddy_cache; +- struct page *page = NULL, *bitmap_page = NULL; +- +- mb_debug("init group %lu\n", group); +- 
blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; +- this_grp = ext4_get_group_info(sb, group); +- /* +- * This ensures we don't add group +- * to this buddy cache via resize +- */ +- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); +- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { +- /* +- * somebody initialized the group +- * return without doing anything +- */ +- ret = 0; +- goto err; +- } +- /* +- * the buddy cache inode stores the block bitmap +- * and buddy information in consecutive blocks. +- * So for each group we need two blocks. +- */ +- block = group * 2; +- pnum = block / blocks_per_page; +- poff = block % blocks_per_page; +- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); +- if (page) { +- BUG_ON(page->mapping != inode->i_mapping); +- ret = ext4_mb_init_cache(page, NULL); +- if (ret) { +- unlock_page(page); +- goto err; +- } +- unlock_page(page); +- } +- if (page == NULL || !PageUptodate(page)) { +- ret = -EIO; +- goto err; +- } +- mark_page_accessed(page); +- bitmap_page = page; +- bitmap = page_address(page) + (poff * sb->s_blocksize); +- +- /* init buddy cache */ +- block++; +- pnum = block / blocks_per_page; +- poff = block % blocks_per_page; +- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); +- if (page == bitmap_page) { +- /* +- * If both the bitmap and buddy are in +- * the same page we don't need to force +- * init the buddy +- */ +- unlock_page(page); +- } else if (page) { +- BUG_ON(page->mapping != inode->i_mapping); +- ret = ext4_mb_init_cache(page, bitmap); +- if (ret) { +- unlock_page(page); +- goto err; +- } +- unlock_page(page); +- } +- if (page == NULL || !PageUptodate(page)) { +- ret = -EIO; +- goto err; +- } +- mark_page_accessed(page); +-err: +- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); +- if (bitmap_page) +- page_cache_release(bitmap_page); +- if (page) +- page_cache_release(page); +- return ret; +-} +- + static noinline_for_stack int + ext4_mb_regular_allocator(struct 
ext4_allocation_context *ac) + { diff --git a/queue-2.6.31/0026-ext4-check-for-need-init-flag-in-ext4_mb_load_buddy.patch b/queue-2.6.31/0026-ext4-check-for-need-init-flag-in-ext4_mb_load_buddy.patch new file mode 100644 index 00000000000..54be13519aa --- /dev/null +++ b/queue-2.6.31/0026-ext4-check-for-need-init-flag-in-ext4_mb_load_buddy.patch @@ -0,0 +1,79 @@ +From a508d6aa722ae358cf885dd33be7f02d18ba9e39 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Wed, 9 Sep 2009 23:34:50 -0400 +Subject: [PATCH 26/85] ext4: check for need init flag in ext4_mb_load_buddy + +(cherry picked from commit f41c0750538667b87a19c93952e5d42fcc069bd7) + +We should check for need init flag with the group's alloc_sem held, to +make sure while we are loading the buddy cache and holding a reference +to it, a file system resize can't add new blocks to same group. + +The patch also drops the need init flag check in +ext4_mb_regular_allocator() because doing the check without holding +alloc_sem is racy. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 39 ++++++++++++++++++--------------------- + 1 file changed, 18 insertions(+), 21 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1032,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *s + * groups mapped by the page is blocked + * till we are done with allocation + */ ++repeat_load_buddy: + down_read(e4b->alloc_semp); + ++ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ++ /* we need to check for group need init flag ++ * with alloc_semp held so that we can be sure ++ * that new blocks didn't get added to the group ++ * when we are loading the buddy cache ++ */ ++ up_read(e4b->alloc_semp); ++ /* ++ * we need full data about the group ++ * to make a good selection ++ */ ++ ret = ext4_mb_init_group(sb, group); ++ if (ret) ++ return ret; ++ goto repeat_load_buddy; ++ } ++ + /* + * the buddy cache inode stores the block bitmap + * and buddy 
information in consecutive blocks. +@@ -2010,27 +2028,6 @@ repeat: + if (grp->bb_free == 0) + continue; + +- /* +- * if the group is already init we check whether it is +- * a good group and if not we don't load the buddy +- */ +- if (EXT4_MB_GRP_NEED_INIT(grp)) { +- /* +- * we need full data about the group +- * to make a good selection +- */ +- err = ext4_mb_init_group(sb, group); +- if (err) +- goto out; +- } +- +- /* +- * If the particular group doesn't satisfy our +- * criteria we continue with the next group +- */ +- if (!ext4_mb_good_group(ac, group, cr)) +- continue; +- + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) + goto out; diff --git a/queue-2.6.31/0027-ext4-Don-t-update-superblock-write-time-when-filesys.patch b/queue-2.6.31/0027-ext4-Don-t-update-superblock-write-time-when-filesys.patch new file mode 100644 index 00000000000..613b613ed5b --- /dev/null +++ b/queue-2.6.31/0027-ext4-Don-t-update-superblock-write-time-when-filesys.patch @@ -0,0 +1,41 @@ +From 3caa2ec6f2cc09d45329c06cc2cc85c3f3ef4d78 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 10 Sep 2009 17:31:04 -0400 +Subject: [PATCH 27/85] ext4: Don't update superblock write time when filesystem is read-only + +(cherry picked from commit 71290b368ad5e1e0b0b300c9d5638490a9fd1a2d) + +This avoids updating the superblock write time when we are mounting +the root file system read/only but we need to replay the journal; at +that point, for people who are east of GMT and who make their clock +tick in localtime for Windows bug-for-bug compatibility, the clock is set in the future, and this will +cause e2fsck to complain and force a full file system check.
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3230,7 +3230,18 @@ static int ext4_commit_super(struct supe + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } +- es->s_wtime = cpu_to_le32(get_seconds()); ++ /* ++ * If the file system is mounted read-only, don't update the ++ * superblock write time. This avoids updating the superblock ++ * write time when we are mounting the root file system ++ * read/only but we need to replay the journal; at that point, ++ * for people who are east of GMT and who make their clock ++ * tick in localtime for Windows bug-for-bug compatibility, ++ * the clock is set in the future, and this will cause e2fsck ++ * to complain and force a full file system check. ++ */ ++ if (!(sb->s_flags & MS_RDONLY)) ++ es->s_wtime = cpu_to_le32(get_seconds()); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - diff --git a/queue-2.6.31/0028-ext4-Always-set-dx_node-s-fake_dirent-explicitly.patch b/queue-2.6.31/0028-ext4-Always-set-dx_node-s-fake_dirent-explicitly.patch new file mode 100644 index 00000000000..9dd7d0c22da --- /dev/null +++ b/queue-2.6.31/0028-ext4-Always-set-dx_node-s-fake_dirent-explicitly.patch @@ -0,0 +1,32 @@ +From 0bbc74216d516daa782b47f3b87e31ef36a6ae4d Mon Sep 17 00:00:00 2001 +From: Andreas Schlick +Date: Thu, 10 Sep 2009 23:16:07 -0400 +Subject: [PATCH 28/85] ext4: Always set dx_node's fake_dirent explicitly. + +(cherry picked from commit 1f7bebb9e911d870fa8f997ddff838e82b5715ea) + +When ext4_dx_add_entry() has to split an index node, it has to ensure that +name_len of dx_node's fake_dirent is also zero, because otherwise e2fsck +won't recognise it as an intermediate htree node and consider the htree to +be corrupted. 
+ +Signed-off-by: Andreas Schlick +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/namei.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1590,9 +1590,9 @@ static int ext4_dx_add_entry(handle_t *h + goto cleanup; + node2 = (struct dx_node *)(bh2->b_data); + entries2 = node2->entries; ++ memset(&node2->fake, 0, sizeof(struct fake_dirent)); + node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, + sb->s_blocksize); +- node2->fake.inode = 0; + BUFFER_TRACE(frame->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, frame->bh); + if (err) diff --git a/queue-2.6.31/0029-ext4-Fix-initalization-of-s_flex_groups.patch b/queue-2.6.31/0029-ext4-Fix-initalization-of-s_flex_groups.patch new file mode 100644 index 00000000000..11ebcea1b70 --- /dev/null +++ b/queue-2.6.31/0029-ext4-Fix-initalization-of-s_flex_groups.patch @@ -0,0 +1,44 @@ +From c2c92dc7916eae0ded23057404a9af50f9890b5e Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 11 Sep 2009 16:51:28 -0400 +Subject: [PATCH 29/85] ext4: Fix initalization of s_flex_groups + +(cherry picked from commit 7ad9bb651fc2036ea94bed94da76a4b08959a911) + +The s_flex_groups array should have been initialized using atomic_add +to sum up the free counts from the block groups that make up a +flex_bg. By using atomic_set, the value of the s_flex_groups array +was set to the values of the last block group in the flex_bg. + +The impact of this bug is that the block and inode allocation +algorithms might not pick the best flex_bg for new allocation. + +Thanks to Damien Guibouret for pointing out this problem! 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1696,12 +1696,12 @@ static int ext4_fill_flex_info(struct su + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); +- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, +- ext4_free_inodes_count(sb, gdp)); +- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, +- ext4_free_blks_count(sb, gdp)); +- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, +- ext4_used_dirs_count(sb, gdp)); ++ atomic_add(ext4_free_inodes_count(sb, gdp), ++ &sbi->s_flex_groups[flex_group].free_inodes); ++ atomic_add(ext4_free_blks_count(sb, gdp), ++ &sbi->s_flex_groups[flex_group].free_blocks); ++ atomic_add(ext4_used_dirs_count(sb, gdp), ++ &sbi->s_flex_groups[flex_group].used_dirs); + } + + return 1; diff --git a/queue-2.6.31/0030-ext4-Fix-include-trace-events-ext4.h-to-work-with-Sy.patch b/queue-2.6.31/0030-ext4-Fix-include-trace-events-ext4.h-to-work-with-Sy.patch new file mode 100644 index 00000000000..19242e6d3cf --- /dev/null +++ b/queue-2.6.31/0030-ext4-Fix-include-trace-events-ext4.h-to-work-with-Sy.patch @@ -0,0 +1,50 @@ +From b27c27c1ae02b5ad2b96ec0043822270a980717d Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 14 Sep 2009 22:59:50 -0400 +Subject: [PATCH 30/85] ext4: Fix include/trace/events/ext4.h to work with Systemtap + +(cherry picked from commit 3661d28615ea580c1db02a972fd4d3898df1cb01) + +Using relative pathnames in #include statements interacts badly with +SystemTap, since the fs/ext4/*.h header files are not packaged up as +part of a distribution kernel's header files. Since systemtap doesn't +use TP_fast_assign(), we can use a blind structure definition and then +make sure the needed header files are defined before the ext4 source +files #include the trace/events/ext4.h header file. 
+ +https://bugzilla.redhat.com/show_bug.cgi?id=512478 + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 1 + + include/trace/events/ext4.h | 6 ++++-- + 2 files changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -45,6 +45,7 @@ + #include "ext4_jbd2.h" + #include "xattr.h" + #include "acl.h" ++#include "mballoc.h" + + #define CREATE_TRACE_POINTS + #include +--- a/include/trace/events/ext4.h ++++ b/include/trace/events/ext4.h +@@ -5,10 +5,12 @@ + #define _TRACE_EXT4_H + + #include +-#include "../../../fs/ext4/ext4.h" +-#include "../../../fs/ext4/mballoc.h" + #include + ++struct ext4_allocation_context; ++struct ext4_allocation_request; ++struct ext4_prealloc_space; ++ + TRACE_EVENT(ext4_free_inode, + TP_PROTO(struct inode *inode), + diff --git a/queue-2.6.31/0031-ext4-Fix-small-typo-for-move_extent_per_page.patch b/queue-2.6.31/0031-ext4-Fix-small-typo-for-move_extent_per_page.patch new file mode 100644 index 00000000000..c5db7a410c7 --- /dev/null +++ b/queue-2.6.31/0031-ext4-Fix-small-typo-for-move_extent_per_page.patch @@ -0,0 +1,37 @@ +From e0ab5e1c37b2b98baa93b87d7ac05f11c981c55b Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sat, 5 Sep 2009 23:12:41 -0400 +Subject: [PATCH 31/85] ext4: Fix small typo for move_extent_per_page() + +(cherry picked from commit 44fc48f7048ab9657b524938a832fec4e0acea98) + +This function means moving extents every page, so change its name from +move_extent_par_page(). + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -740,7 +740,7 @@ out: + * on success, or a negative error value on failure.
+ */ + static int +-move_extent_par_page(struct file *o_filp, struct inode *donor_inode, ++move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, + int block_len_in_page, int uninit) + { +@@ -1267,7 +1267,7 @@ ext4_move_extents(struct file *o_filp, s + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ +- ret = move_extent_par_page(o_filp, donor_inode, ++ ret = move_extent_per_page(o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit); diff --git a/queue-2.6.31/0032-ext4-Replace-get_ext_path-macro-with-an-inline-funci.patch b/queue-2.6.31/0032-ext4-Replace-get_ext_path-macro-with-an-inline-funci.patch new file mode 100644 index 00000000000..f527ace084d --- /dev/null +++ b/queue-2.6.31/0032-ext4-Replace-get_ext_path-macro-with-an-inline-funci.patch @@ -0,0 +1,146 @@ +From 0614ebefe0146affc8ff005c06c2f1cf81560493 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Wed, 16 Sep 2009 13:46:38 -0400 +Subject: [PATCH 32/85] ext4: Replace get_ext_path macro with an inline funciton + +(cherry picked from commit e8505970af46658ece2545e9bc1fe594998fdcdf) + +Replace get_ext_path macro with an inline function, +since this macro looks like a function call but its arguments +get modified. Ted pointed this out, thanks. 
+ +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 55 ++++++++++++++++++++++++++++++-------------------- + 1 file changed, 34 insertions(+), 21 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -19,14 +19,29 @@ + #include "ext4_extents.h" + #include "ext4.h" + +-#define get_ext_path(path, inode, block, ret) \ +- do { \ +- path = ext4_ext_find_extent(inode, block, path); \ +- if (IS_ERR(path)) { \ +- ret = PTR_ERR(path); \ +- path = NULL; \ +- } \ +- } while (0) ++/** ++ * get_ext_path - Find an extent path for designated logical block number. ++ * ++ * @inode: an inode which is searched ++ * @lblock: logical block number to find an extent path ++ * @path: pointer to an extent path pointer (for output) ++ * ++ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value ++ * on failure. ++ */ ++static inline int ++get_ext_path(struct inode *inode, ext4_lblk_t lblock, ++ struct ext4_ext_path **path) ++{ ++ int ret = 0; ++ ++ *path = ext4_ext_find_extent(inode, lblock, *path); ++ if (IS_ERR(*path)) { ++ ret = PTR_ERR(*path); ++ *path = NULL; ++ } ++ return ret; ++} + + /** + * copy_extent_status - Copy the extent's initialization status +@@ -283,7 +298,7 @@ mext_insert_across_blocks(handle_t *hand + } + + if (new_flag) { +- get_ext_path(orig_path, orig_inode, eblock, err); ++ err = get_ext_path(orig_inode, eblock, &orig_path); + if (orig_path == NULL) + goto out; + +@@ -293,8 +308,8 @@ mext_insert_across_blocks(handle_t *hand + } + + if (end_flag) { +- get_ext_path(orig_path, orig_inode, +- le32_to_cpu(end_ext->ee_block) - 1, err); ++ err = get_ext_path(orig_inode, ++ le32_to_cpu(end_ext->ee_block) - 1, &orig_path); + if (orig_path == NULL) + goto out; + +@@ -631,12 +646,12 @@ mext_replace_branches(handle_t *handle, + mext_double_down_write(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ +- 
get_ext_path(orig_path, orig_inode, orig_off, err); ++ err = get_ext_path(orig_inode, orig_off, &orig_path); + if (orig_path == NULL) + goto out; + + /* Get the donor extent for the head */ +- get_ext_path(donor_path, donor_inode, donor_off, err); ++ err = get_ext_path(donor_inode, donor_off, &donor_path); + if (donor_path == NULL) + goto out; + depth = ext_depth(orig_inode); +@@ -678,7 +693,7 @@ mext_replace_branches(handle_t *handle, + + if (orig_path) + ext4_ext_drop_refs(orig_path); +- get_ext_path(orig_path, orig_inode, orig_off, err); ++ err = get_ext_path(orig_inode, orig_off, &orig_path); + if (orig_path == NULL) + goto out; + depth = ext_depth(orig_inode); +@@ -692,8 +707,7 @@ mext_replace_branches(handle_t *handle, + + if (donor_path) + ext4_ext_drop_refs(donor_path); +- get_ext_path(donor_path, donor_inode, +- donor_off, err); ++ err = get_ext_path(donor_inode, donor_off, &donor_path); + if (donor_path == NULL) + goto out; + depth = ext_depth(donor_inode); +@@ -1154,12 +1168,12 @@ ext4_move_extents(struct file *o_filp, s + if (file_end < block_end) + len -= block_end - file_end; + +- get_ext_path(orig_path, orig_inode, block_start, ret); ++ ret = get_ext_path(orig_inode, block_start, &orig_path); + if (orig_path == NULL) + goto out2; + + /* Get path structure to check the hole */ +- get_ext_path(holecheck_path, orig_inode, block_start, ret); ++ ret = get_ext_path(orig_inode, block_start, &holecheck_path); + if (holecheck_path == NULL) + goto out; + +@@ -1289,8 +1303,7 @@ ext4_move_extents(struct file *o_filp, s + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); +- get_ext_path(holecheck_path, orig_inode, +- seq_start, ret); ++ ret = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (holecheck_path == NULL) + break; + depth = holecheck_path->p_depth; +@@ -1298,7 +1311,7 @@ ext4_move_extents(struct file *o_filp, s + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); +- 
get_ext_path(orig_path, orig_inode, seq_start, ret); ++ ret = get_ext_path(orig_inode, seq_start, &orig_path); + if (orig_path == NULL) + break; + diff --git a/queue-2.6.31/0033-ext4-Replace-BUG_ON-with-ext4_error-in-move_extents..patch b/queue-2.6.31/0033-ext4-Replace-BUG_ON-with-ext4_error-in-move_extents..patch new file mode 100644 index 00000000000..30444917bbc --- /dev/null +++ b/queue-2.6.31/0033-ext4-Replace-BUG_ON-with-ext4_error-in-move_extents..patch @@ -0,0 +1,356 @@ +From 940398388f08f6a67816ca0b446d1a03699c5f93 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Wed, 16 Sep 2009 13:46:35 -0400 +Subject: [PATCH 33/85] ext4: Replace BUG_ON() with ext4_error() in move_extents.c + +(cherry picked from commit 2147b1a6a48e28399120ca51d4a91840a278611f) + +Replace BUG_ON calls with a call to ext4_error() +to print an error message if EXT4_IOC_MOVE_EXT failed +with some kind of reasons. This will help to debug. +Ted pointed this out, thanks. + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 149 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 109 insertions(+), 40 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -128,6 +128,31 @@ mext_next_extent(struct inode *inode, st + } + + /** ++ * mext_check_null_inode - NULL check for two inodes ++ * ++ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
++ */ ++static int ++mext_check_null_inode(struct inode *inode1, struct inode *inode2, ++ const char *function) ++{ ++ int ret = 0; ++ ++ if (inode1 == NULL) { ++ ext4_error(inode2->i_sb, function, ++ "Both inodes should not be NULL: " ++ "inode1 NULL inode2 %lu", inode2->i_ino); ++ ret = -EIO; ++ } else if (inode2 == NULL) { ++ ext4_error(inode1->i_sb, function, ++ "Both inodes should not be NULL: " ++ "inode1 %lu inode2 NULL", inode1->i_ino); ++ ret = -EIO; ++ } ++ return ret; ++} ++ ++/** + * mext_double_down_read - Acquire two inodes' read semaphore + * + * @orig_inode: original inode structure +@@ -139,8 +164,6 @@ mext_double_down_read(struct inode *orig + { + struct inode *first = orig_inode, *second = donor_inode; + +- BUG_ON(orig_inode == NULL || donor_inode == NULL); +- + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can +@@ -167,8 +190,6 @@ mext_double_down_write(struct inode *ori + { + struct inode *first = orig_inode, *second = donor_inode; + +- BUG_ON(orig_inode == NULL || donor_inode == NULL); +- + /* + * Use the inode number to provide the stable locking order instead + * of its address, because the C language doesn't guarantee you can +@@ -193,8 +214,6 @@ mext_double_down_write(struct inode *ori + static void + mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) + { +- BUG_ON(orig_inode == NULL || donor_inode == NULL); +- + up_read(&EXT4_I(orig_inode)->i_data_sem); + up_read(&EXT4_I(donor_inode)->i_data_sem); + } +@@ -209,8 +228,6 @@ mext_double_up_read(struct inode *orig_i + static void + mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) + { +- BUG_ON(orig_inode == NULL || donor_inode == NULL); +- + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); + } +@@ -534,7 +551,15 @@ mext_leaf_block(handle_t *handle, struct + * oext |-----------| + * new_ext |-------| + */ +- 
BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); ++ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { ++ ext4_error(orig_inode->i_sb, __func__, ++ "new_ext_end(%u) should be less than or equal to " ++ "oext->ee_block(%u) + oext_alen(%d) - 1", ++ new_ext_end, le32_to_cpu(oext->ee_block), ++ oext_alen); ++ ret = -EIO; ++ goto out; ++ } + + /* + * Case: new_ext is smaller than original extent +@@ -558,6 +583,7 @@ mext_leaf_block(handle_t *handle, struct + + ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, + o_end, &start_ext, &new_ext, &end_ext); ++out: + return ret; + } + +@@ -668,7 +694,20 @@ mext_replace_branches(handle_t *handle, + /* Loop for the donor extents */ + while (1) { + /* The extent for donor must be found. */ +- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); ++ if (!dext) { ++ ext4_error(donor_inode->i_sb, __func__, ++ "The extent for donor must be found"); ++ err = -EIO; ++ goto out; ++ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { ++ ext4_error(donor_inode->i_sb, __func__, ++ "Donor offset(%u) and the first block of donor " ++ "extent(%u) should be equal", ++ donor_off, ++ le32_to_cpu(tmp_dext.ee_block)); ++ err = -EIO; ++ goto out; ++ } + + /* Set donor extent to orig extent */ + err = mext_leaf_block(handle, orig_inode, +@@ -1050,18 +1089,23 @@ mext_check_arguments(struct inode *orig_ + * @inode1: the inode structure + * @inode2: the inode structure + * +- * Lock two inodes' i_mutex by i_ino order. This function is moved from +- * fs/inode.c. ++ * Lock two inodes' i_mutex by i_ino order. ++ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
+ */ +-static void ++static int + mext_inode_double_lock(struct inode *inode1, struct inode *inode2) + { +- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { +- if (inode1) +- mutex_lock(&inode1->i_mutex); +- else if (inode2) +- mutex_lock(&inode2->i_mutex); +- return; ++ int ret = 0; ++ ++ BUG_ON(inode1 == NULL && inode2 == NULL); ++ ++ ret = mext_check_null_inode(inode1, inode2, __func__); ++ if (ret < 0) ++ goto out; ++ ++ if (inode1 == inode2) { ++ mutex_lock(&inode1->i_mutex); ++ goto out; + } + + if (inode1->i_ino < inode2->i_ino) { +@@ -1071,6 +1115,9 @@ mext_inode_double_lock(struct inode *ino + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); + } ++ ++out: ++ return ret; + } + + /** +@@ -1079,17 +1126,28 @@ mext_inode_double_lock(struct inode *ino + * @inode1: the inode that is released first + * @inode2: the inode that is released second + * +- * This function is moved from fs/inode.c. ++ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. 
+ */ + +-static void ++static int + mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) + { ++ int ret = 0; ++ ++ BUG_ON(inode1 == NULL && inode2 == NULL); ++ ++ ret = mext_check_null_inode(inode1, inode2, __func__); ++ if (ret < 0) ++ goto out; ++ + if (inode1) + mutex_unlock(&inode1->i_mutex); + + if (inode2 && inode2 != inode1) + mutex_unlock(&inode2->i_mutex); ++ ++out: ++ return ret; + } + + /** +@@ -1146,21 +1204,23 @@ ext4_move_extents(struct file *o_filp, s + ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; + ext4_lblk_t rest_blocks; + pgoff_t orig_page_offset = 0, seq_end_page; +- int ret, depth, last_extent = 0; ++ int ret1, ret2, depth, last_extent = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + int data_offset_in_page; + int block_len_in_page; + int uninit; + + /* protect orig and donor against a truncate */ +- mext_inode_double_lock(orig_inode, donor_inode); ++ ret1 = mext_inode_double_lock(orig_inode, donor_inode); ++ if (ret1 < 0) ++ return ret1; + + mext_double_down_read(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ +- ret = mext_check_arguments(orig_inode, donor_inode, orig_start, ++ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len, *moved_len); + mext_double_up_read(orig_inode, donor_inode); +- if (ret) ++ if (ret1) + goto out2; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; +@@ -1168,19 +1228,19 @@ ext4_move_extents(struct file *o_filp, s + if (file_end < block_end) + len -= block_end - file_end; + +- ret = get_ext_path(orig_inode, block_start, &orig_path); ++ ret1 = get_ext_path(orig_inode, block_start, &orig_path); + if (orig_path == NULL) + goto out2; + + /* Get path structure to check the hole */ +- ret = get_ext_path(orig_inode, block_start, &holecheck_path); ++ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); + if (holecheck_path == NULL) + goto out; + 
+ depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; + if (ext_cur == NULL) { +- ret = -EINVAL; ++ ret1 = -EINVAL; + goto out; + } + +@@ -1193,13 +1253,13 @@ ext4_move_extents(struct file *o_filp, s + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { +- ret = last_extent; ++ ret1 = last_extent; + goto out; + } + last_extent = mext_next_extent(orig_inode, orig_path, + &ext_dummy); + if (last_extent < 0) { +- ret = last_extent; ++ ret1 = last_extent; + goto out; + } + } +@@ -1209,7 +1269,7 @@ ext4_move_extents(struct file *o_filp, s + if (le32_to_cpu(ext_cur->ee_block) > block_end) { + ext4_debug("ext4 move extent: The specified range of file " + "may be the hole\n"); +- ret = -EINVAL; ++ ret1 = -EINVAL; + goto out; + } + +@@ -1229,7 +1289,7 @@ ext4_move_extents(struct file *o_filp, s + last_extent = mext_next_extent(orig_inode, holecheck_path, + &ext_cur); + if (last_extent < 0) { +- ret = last_extent; ++ ret1 = last_extent; + break; + } + add_blocks = ext4_ext_get_actual_len(ext_cur); +@@ -1281,16 +1341,23 @@ ext4_move_extents(struct file *o_filp, s + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ +- ret = move_extent_per_page(o_filp, donor_inode, ++ ret1 = move_extent_per_page(o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, + block_len_in_page, uninit); +- if (ret < 0) ++ if (ret1 < 0) + goto out; + orig_page_offset++; + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; +- BUG_ON(*moved_len > len); ++ if (*moved_len > len) { ++ ext4_error(orig_inode->i_sb, __func__, ++ "We replaced blocks too much! 
" ++ "sum of replaced: %llu requested: %llu", ++ *moved_len, len); ++ ret1 = -EIO; ++ goto out; ++ } + + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; +@@ -1303,7 +1370,7 @@ ext4_move_extents(struct file *o_filp, s + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); +- ret = get_ext_path(orig_inode, seq_start, &holecheck_path); ++ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); + if (holecheck_path == NULL) + break; + depth = holecheck_path->p_depth; +@@ -1311,7 +1378,7 @@ ext4_move_extents(struct file *o_filp, s + /* Decrease buffer counter */ + if (orig_path) + ext4_ext_drop_refs(orig_path); +- ret = get_ext_path(orig_inode, seq_start, &orig_path); ++ ret1 = get_ext_path(orig_inode, seq_start, &orig_path); + if (orig_path == NULL) + break; + +@@ -1330,10 +1397,12 @@ out: + kfree(holecheck_path); + } + out2: +- mext_inode_double_unlock(orig_inode, donor_inode); ++ ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + +- if (ret) +- return ret; ++ if (ret1) ++ return ret1; ++ else if (ret2) ++ return ret2; + + return 0; + } diff --git a/queue-2.6.31/0034-ext4-Add-null-extent-check-to-ext_get_path.patch b/queue-2.6.31/0034-ext4-Add-null-extent-check-to-ext_get_path.patch new file mode 100644 index 00000000000..4fae53b4254 --- /dev/null +++ b/queue-2.6.31/0034-ext4-Add-null-extent-check-to-ext_get_path.patch @@ -0,0 +1,146 @@ +From a38bdc94597ca6b76cb0b5cf1bdfdd845a5690a6 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Wed, 16 Sep 2009 14:25:07 -0400 +Subject: [PATCH 34/85] ext4: Add null extent check to ext_get_path + +(cherry picked from commit 347fa6f1c7cb5df2b38d3c9167cfe242ce0cd1da) + +There is the possibility that path structure which is taken +by ext4_ext_find_extent() indicates null extents. +Because during data block exchanging in ext4_move_extents(), +constitution of an extent tree may be changed. +As a solution, the patch adds null extent check +to ext_get_path(). 
+ +Reported-by: Peng Tao +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 34 ++++++++++++++++------------------ + 1 file changed, 16 insertions(+), 18 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -39,7 +39,9 @@ get_ext_path(struct inode *inode, ext4_l + if (IS_ERR(*path)) { + ret = PTR_ERR(*path); + *path = NULL; +- } ++ } else if ((*path)[ext_depth(inode)].p_ext == NULL) ++ ret = -ENODATA; ++ + return ret; + } + +@@ -316,7 +318,7 @@ mext_insert_across_blocks(handle_t *hand + + if (new_flag) { + err = get_ext_path(orig_inode, eblock, &orig_path); +- if (orig_path == NULL) ++ if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, +@@ -327,7 +329,7 @@ mext_insert_across_blocks(handle_t *hand + if (end_flag) { + err = get_ext_path(orig_inode, + le32_to_cpu(end_ext->ee_block) - 1, &orig_path); +- if (orig_path == NULL) ++ if (err) + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, +@@ -673,12 +675,12 @@ mext_replace_branches(handle_t *handle, + + /* Get the original extent for the block "orig_off" */ + err = get_ext_path(orig_inode, orig_off, &orig_path); +- if (orig_path == NULL) ++ if (err) + goto out; + + /* Get the donor extent for the head */ + err = get_ext_path(donor_inode, donor_off, &donor_path); +- if (donor_path == NULL) ++ if (err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; +@@ -733,7 +735,7 @@ mext_replace_branches(handle_t *handle, + if (orig_path) + ext4_ext_drop_refs(orig_path); + err = get_ext_path(orig_inode, orig_off, &orig_path); +- if (orig_path == NULL) ++ if (err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; +@@ -747,7 +749,7 @@ mext_replace_branches(handle_t *handle, + if (donor_path) + ext4_ext_drop_refs(donor_path); + err = get_ext_path(donor_inode, donor_off, &donor_path); +- if (donor_path == NULL) ++ if (err) + goto out; + depth = 
ext_depth(donor_inode); + dext = donor_path[depth].p_ext; +@@ -1221,7 +1223,7 @@ ext4_move_extents(struct file *o_filp, s + donor_start, &len, *moved_len); + mext_double_up_read(orig_inode, donor_inode); + if (ret1) +- goto out2; ++ goto out; + + file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; + block_end = block_start + len - 1; +@@ -1229,20 +1231,16 @@ ext4_move_extents(struct file *o_filp, s + len -= block_end - file_end; + + ret1 = get_ext_path(orig_inode, block_start, &orig_path); +- if (orig_path == NULL) +- goto out2; ++ if (ret1) ++ goto out; + + /* Get path structure to check the hole */ + ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); +- if (holecheck_path == NULL) ++ if (ret1) + goto out; + + depth = ext_depth(orig_inode); + ext_cur = holecheck_path[depth].p_ext; +- if (ext_cur == NULL) { +- ret1 = -EINVAL; +- goto out; +- } + + /* + * Get proper extent whose ee_block is beyond block_start +@@ -1371,7 +1369,7 @@ ext4_move_extents(struct file *o_filp, s + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); + ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); +- if (holecheck_path == NULL) ++ if (ret1) + break; + depth = holecheck_path->p_depth; + +@@ -1379,7 +1377,7 @@ ext4_move_extents(struct file *o_filp, s + if (orig_path) + ext4_ext_drop_refs(orig_path); + ret1 = get_ext_path(orig_inode, seq_start, &orig_path); +- if (orig_path == NULL) ++ if (ret1) + break; + + ext_cur = holecheck_path[depth].p_ext; +@@ -1396,7 +1394,7 @@ out: + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } +-out2: ++ + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret1) diff --git a/queue-2.6.31/0035-ext4-Fix-different-block-exchange-issue-in-EXT4_IOC_.patch b/queue-2.6.31/0035-ext4-Fix-different-block-exchange-issue-in-EXT4_IOC_.patch new file mode 100644 index 00000000000..ab6a7dc62d7 --- /dev/null +++ b/queue-2.6.31/0035-ext4-Fix-different-block-exchange-issue-in-EXT4_IOC_.patch @@ -0,0 
+1,126 @@ +From 94a3721bf6f39d1adea6b85ab4905255bf84a10f Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Wed, 16 Sep 2009 14:25:39 -0400 +Subject: [PATCH 35/85] ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT + +(cherry picked from commit c40ce3c9ea97425a12d7e44031a98fe50add6fc1) + +If logical block offset of original file which is passed to +EXT4_IOC_MOVE_EXT is different from donor file's, +a calculation error occurs in ext4_calc_swap_extents(), +therefore wrong block is exchanged between original file and donor file. +As a result, we hit ext4_error() in check_block_validity(). +To detect the logical offset difference in EXT4_IOC_MOVE_EXT, +add checks to mext_calc_swap_extents() and handle it as error, +since data exchange must be done between the same blocks in EXT4_IOC_MOVE_EXT. + +Reported-by: Peng Tao +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 37 insertions(+), 9 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -597,8 +597,10 @@ out: + * @orig_off: block offset of original inode + * @donor_off: block offset of donor inode + * @max_count: the maximun length of extents ++ * ++ * Return 0 on success, or a negative error value on failure. 
+ */ +-static void ++static int + mext_calc_swap_extents(struct ext4_extent *tmp_dext, + struct ext4_extent *tmp_oext, + ext4_lblk_t orig_off, ext4_lblk_t donor_off, +@@ -607,6 +609,19 @@ mext_calc_swap_extents(struct ext4_exten + ext4_lblk_t diff, orig_diff; + struct ext4_extent dext_old, oext_old; + ++ BUG_ON(orig_off != donor_off); ++ ++ /* original and donor extents have to cover the same block offset */ ++ if (orig_off < le32_to_cpu(tmp_oext->ee_block) || ++ le32_to_cpu(tmp_oext->ee_block) + ++ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) ++ return -ENODATA; ++ ++ if (orig_off < le32_to_cpu(tmp_dext->ee_block) || ++ le32_to_cpu(tmp_dext->ee_block) + ++ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) ++ return -ENODATA; ++ + dext_old = *tmp_dext; + oext_old = *tmp_oext; + +@@ -634,6 +649,8 @@ mext_calc_swap_extents(struct ext4_exten + + copy_extent_status(&oext_old, tmp_dext); + copy_extent_status(&dext_old, tmp_oext); ++ ++ return 0; + } + + /** +@@ -690,8 +707,10 @@ mext_replace_branches(handle_t *handle, + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + +- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, ++ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); ++ if (err) ++ goto out; + + /* Loop for the donor extents */ + while (1) { +@@ -760,9 +779,10 @@ mext_replace_branches(handle_t *handle, + } + tmp_dext = *dext; + +- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, +- donor_off, +- count - replaced_count); ++ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, ++ donor_off, count - replaced_count); ++ if (err) ++ goto out; + } + + out: +@@ -1243,11 +1263,15 @@ ext4_move_extents(struct file *o_filp, s + ext_cur = holecheck_path[depth].p_ext; + + /* +- * Get proper extent whose ee_block is beyond block_start +- * if block_start was within the hole. ++ * Get proper starting location of block replacement if block_start was ++ * within the hole. 
+ */ + if (le32_to_cpu(ext_cur->ee_block) + + ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { ++ /* ++ * The hole exists between extents or the tail of ++ * original file. ++ */ + last_extent = mext_next_extent(orig_inode, + holecheck_path, &ext_cur); + if (last_extent < 0) { +@@ -1260,8 +1284,12 @@ ext4_move_extents(struct file *o_filp, s + ret1 = last_extent; + goto out; + } +- } +- seq_start = block_start; ++ seq_start = le32_to_cpu(ext_cur->ee_block); ++ } else if (le32_to_cpu(ext_cur->ee_block) > block_start) ++ /* The hole exists at the beginning of original file. */ ++ seq_start = le32_to_cpu(ext_cur->ee_block); ++ else ++ seq_start = block_start; + + /* No blocks within the specified range. */ + if (le32_to_cpu(ext_cur->ee_block) > block_end) { diff --git a/queue-2.6.31/0036-ext4-limit-block-allocations-for-indirect-block-file.patch b/queue-2.6.31/0036-ext4-limit-block-allocations-for-indirect-block-file.patch new file mode 100644 index 00000000000..cfc85a44636 --- /dev/null +++ b/queue-2.6.31/0036-ext4-limit-block-allocations-for-indirect-block-file.patch @@ -0,0 +1,178 @@ +From 203425242df0e5bfa7bd86d934500583390f5fe6 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Wed, 16 Sep 2009 14:45:10 -0400 +Subject: [PATCH 36/85] ext4: limit block allocations for indirect-block files to < 2^32 + +(cherry picked from commit fb0a387dcdcd21aab1b09ee7fd80b7c979bdbbfd) + +Today, the ext4 allocator will happily allocate blocks past +2^32 for indirect-block files, which results in the block +numbers getting truncated, and corruption ensues. + +This patch limits such allocations to < 2^32, and adds +BUG_ONs if we do get blocks larger than that. + +This should address RH Bug 519471, ext4 bitmap allocator +must limit blocks to < 2^32 + +* ext4_find_goal() is modified to choose a goal < UINT_MAX, + so that our starting point is in an acceptable range. + +* ext4_xattr_block_set() is modified such that the goal block + is < UINT_MAX, as above. 
+ +* ext4_mb_regular_allocator() is modified so that the group + search does not continue into groups which are too high + +* ext4_mb_use_preallocated() has a check that we don't use + preallocated space which is too far out + +* ext4_alloc_blocks() and ext4_xattr_block_set() add some BUG_ONs + +No attempt has been made to limit inode locations to < 2^32, +so we may wind up with blocks far from their inodes. Doing +this much already will lead to some odd ENOSPC issues when the +"lower 32" gets full, and further restricting inodes could +make that even weirder. + +For high inodes, choosing a goal of the original, % UINT_MAX, +may be a bit odd, but then we're in an odd situation anyway, +and I don't know of a better heuristic. + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 4 ++++ + fs/ext4/inode.c | 11 ++++++++++- + fs/ext4/mballoc.c | 9 +++++++++ + fs/ext4/super.c | 2 ++ + fs/ext4/xattr.c | 15 +++++++++++++-- + 5 files changed, 38 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -388,6 +388,9 @@ struct ext4_mount_options { + #endif + }; + ++/* Max physical block we can addres w/o extents */ ++#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF ++ + /* + * Structure of an inode on the disk + */ +@@ -843,6 +846,7 @@ struct ext4_sb_info { + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ ++ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -564,15 +564,21 @@ static ext4_fsblk_t ext4_find_near(struc + * + * Normally this function find the preferred place for 
block allocation, + * returns it. ++ * Because this is only used for non-extent files, we limit the block nr ++ * to 32 bits. + */ + static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, + Indirect *partial) + { ++ ext4_fsblk_t goal; ++ + /* + * XXX need to get goal block from mballoc's data structures + */ + +- return ext4_find_near(inode, partial); ++ goal = ext4_find_near(inode, partial); ++ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; ++ return goal; + } + + /** +@@ -653,6 +659,8 @@ static int ext4_alloc_blocks(handle_t *h + if (*err) + goto failed_out; + ++ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); ++ + target -= count; + /* allocate blocks for indirect blocks */ + while (index < indirect_blks && count) { +@@ -687,6 +695,7 @@ static int ext4_alloc_blocks(handle_t *h + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); ++ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); + + if (*err && (target == blks)) { + /* +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1960,6 +1960,10 @@ ext4_mb_regular_allocator(struct ext4_al + sb = ac->ac_sb; + sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); ++ /* non-extent files are limited to low blocks/groups */ ++ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) ++ ngroups = sbi->s_blockfile_groups; ++ + BUG_ON(ac->ac_status == AC_STATUS_FOUND); + + /* first, try the goal */ +@@ -3355,6 +3359,11 @@ ext4_mb_use_preallocated(struct ext4_all + ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) + continue; + ++ /* non-extent files can't have physical blocks past 2^32 */ ++ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && ++ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) ++ continue; ++ + /* found preallocated blocks, use them */ + spin_lock(&pa->pa_lock); + if (pa->pa_deleted == 0 && pa->pa_free) { +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2618,6 +2618,8 @@ static int ext4_fill_super(struct super_ + goto 
failed_mount; + } + sbi->s_groups_count = blocks_count; ++ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, ++ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / + EXT4_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -810,12 +810,23 @@ inserted: + get_bh(new_bh); + } else { + /* We need to allocate a new block */ +- ext4_fsblk_t goal = ext4_group_first_block_no(sb, ++ ext4_fsblk_t goal, block; ++ ++ goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); +- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, ++ ++ /* non-extent files can't have physical blocks past 2^32 */ ++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; ++ ++ block = ext4_new_meta_blocks(handle, inode, + goal, NULL, &error); + if (error) + goto cleanup; ++ ++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); ++ + ea_idebug(inode, "creating block %d", block); + + new_bh = sb_getblk(sb, block); diff --git a/queue-2.6.31/0037-ext4-store-EXT4_EXT_MIGRATE-in-i_state-instead-of-i_.patch b/queue-2.6.31/0037-ext4-store-EXT4_EXT_MIGRATE-in-i_state-instead-of-i_.patch new file mode 100644 index 00000000000..39939bd7627 --- /dev/null +++ b/queue-2.6.31/0037-ext4-store-EXT4_EXT_MIGRATE-in-i_state-instead-of-i_.patch @@ -0,0 +1,107 @@ +From e4e180d52766144b75baec9734d26af92237a3f0 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 17 Sep 2009 08:32:22 -0400 +Subject: [PATCH 37/85] ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags + +(cherry picked from commit 1b9c12f44c1eb614fd3b8822bfe8f1f5d8e53737) + +EXT4_EXT_MIGRATE is only intended to be used for an in-memory flag, +and the hex value assigned to it collides with FS_DIRECTIO_FL (which +is also stored in i_flags). 
There's no reason for the +EXT4_EXT_MIGRATE bit to be stored in i_flags, so we switch it to use +i_state instead. + +Cc: "Aneesh Kumar K.V" +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 2 +- + fs/ext4/inode.c | 6 ++---- + fs/ext4/migrate.c | 20 ++++++++++---------- + 3 files changed, 13 insertions(+), 15 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -253,7 +253,6 @@ struct flex_groups { + #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ + #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ + #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ + #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + + #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ +@@ -291,6 +290,7 @@ static inline __u32 ext4_mask_flags(umod + #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ + #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ + #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ ++#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ + + /* Used to pass group descriptor data when online resize is done */ + struct ext4_new_group_input { +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1256,8 +1256,7 @@ int ext4_get_blocks(handle_t *handle, st + * i_data's format changing. 
Force the migrate + * to fail by clearing migrate flags + */ +- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & +- ~EXT4_EXT_MIGRATE; ++ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + } + } + +@@ -4608,8 +4607,7 @@ static int ext4_do_update_inode(handle_t + if (ext4_inode_blocks_set(handle, raw_inode, ei)) + goto out_brelse; + raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); +- /* clear the migrate flag in the raw_inode */ +- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); ++ raw_inode->i_flags = cpu_to_le32(ei->i_flags); + if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != + cpu_to_le32(EXT4_OS_HURD)) + raw_inode->i_file_acl_high = +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(hand + + down_write(&EXT4_I(inode)->i_data_sem); + /* +- * if EXT4_EXT_MIGRATE is cleared a block allocation ++ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ +- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { ++ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else +- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & +- ~EXT4_EXT_MIGRATE; ++ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + /* + * We have the extent map build with the tmp inode. + * Now copy the i_data across +@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode + * when we add extents we extent the journal + */ + /* +- * Even though we take i_mutex we can still cause block allocation +- * via mmap write to holes. If we have allocated new blocks we fail +- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. +- * The flag is updated with i_data_sem held to prevent racing with +- * block allocation. ++ * Even though we take i_mutex we can still cause block ++ * allocation via mmap write to holes. 
If we have allocated ++ * new blocks we fail migrate. New block allocation will ++ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated ++ * with i_data_sem held to prevent racing with block ++ * allocation. + */ + down_read((&EXT4_I(inode)->i_data_sem)); +- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; ++ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); diff --git a/queue-2.6.31/0038-ext4-Fix-the-alloc-on-close-after-a-truncate-huerist.patch b/queue-2.6.31/0038-ext4-Fix-the-alloc-on-close-after-a-truncate-huerist.patch new file mode 100644 index 00000000000..b25200f93f0 --- /dev/null +++ b/queue-2.6.31/0038-ext4-Fix-the-alloc-on-close-after-a-truncate-huerist.patch @@ -0,0 +1,37 @@ +From 8612c1f51598a672b35ecff9143213b8149fe996 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Thu, 17 Sep 2009 09:34:16 -0400 +Subject: [PATCH 38/85] ext4: Fix the alloc on close after a truncate hueristic + +(cherry picked from commit 5534fb5bb35a62a94e0bd1fa2421f7fb6e894f10) + +In an attempt to avoid doing an unneeded flush after opening a +(previously non-existent) file with O_CREAT|O_TRUNC, the code only +triggered the hueristic if ei->disksize was non-zero. Turns out that +the VFS doesn't call ->truncate() if the file doesn't exist, and +ei->disksize is always zero even if the file previously existed. So +remove the test, since it isn't necessary and in fact disabled the +hueristic. + +Thanks to Clemens Eisserer that he was seeing problems with files +written using kwrite and eclipse after sudden crashes caused by a +buggy Intel video driver. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3983,8 +3983,7 @@ void ext4_truncate(struct inode *inode) + if (!ext4_can_truncate(inode)) + return; + +- if (ei->i_disksize && inode->i_size == 0 && +- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ++ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { diff --git a/queue-2.6.31/0039-ext4-Fix-hueristic-which-avoids-group-preallocation-.patch b/queue-2.6.31/0039-ext4-Fix-hueristic-which-avoids-group-preallocation-.patch new file mode 100644 index 00000000000..01123940914 --- /dev/null +++ b/queue-2.6.31/0039-ext4-Fix-hueristic-which-avoids-group-preallocation-.patch @@ -0,0 +1,36 @@ +From e197c005dc8af29d1b7d11f7ce1feabe970936db Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 28 Sep 2009 00:06:20 -0400 +Subject: [PATCH 39/85] ext4: Fix hueristic which avoids group preallocation for closed files + +(cherry picked from commit 71780577306fd1e76c7a92e3b308db624d03adb9) + +The hueristic was designed to avoid using locality group preallocation +when writing the last segment of a closed file. Fix it by move +setting size to the maximum of size and isize until after we check +whether size == isize. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4162,7 +4162,6 @@ static void ext4_mb_group_or_file(struct + size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; + isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) + >> bsbits; +- size = max(size, isize); + + if ((size == isize) && + !ext4_fs_is_busy(sbi) && +@@ -4172,6 +4171,7 @@ static void ext4_mb_group_or_file(struct + } + + /* don't use group allocation for large files */ ++ size = max(size, isize); + if (size >= sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; diff --git a/queue-2.6.31/0040-ext4-Adjust-ext4_da_writepages-to-write-out-larger-c.patch b/queue-2.6.31/0040-ext4-Adjust-ext4_da_writepages-to-write-out-larger-c.patch new file mode 100644 index 00000000000..f12580e23c4 --- /dev/null +++ b/queue-2.6.31/0040-ext4-Adjust-ext4_da_writepages-to-write-out-larger-c.patch @@ -0,0 +1,344 @@ +From 62ed027952686652b22a75de0d64be2ae00633d1 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Tue, 29 Sep 2009 13:31:31 -0400 +Subject: [PATCH 40/85] ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks + +(cherry picked from commit 55138e0bc29c0751e2152df9ad35deea542f29b3) + +Work around problems in the writeback code to force out writebacks in +larger chunks than just 4mb, which is just too small. This also works +around limitations in the ext4 block allocator, which can't allocate +more than 2048 blocks at a time. So we need to defeat the round-robin +characteristics of the writeback code and try to write out as many +blocks in one inode before allowing the writeback code to move on to +another inode. We add a a new per-filesystem tunable, +max_writeback_mb_bump, which caps this to a default of 128mb per +inode. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 17 ++++++ + fs/ext4/inode.c | 121 +++++++++++++++++++++++++++++++++----------- + fs/ext4/super.c | 3 + + include/trace/events/ext4.h | 54 +++++++++++++++++-- + 4 files changed, 161 insertions(+), 34 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -114,6 +114,22 @@ struct ext4_allocation_request { + }; + + /* ++ * Delayed allocation stuff ++ */ ++ ++struct mpage_da_data { ++ struct inode *inode; ++ sector_t b_blocknr; /* start block number of extent */ ++ size_t b_size; /* size of extent */ ++ unsigned long b_state; /* state of the extent */ ++ unsigned long first_page, next_page; /* extent of pages */ ++ struct writeback_control *wbc; ++ int io_done; ++ int pages_written; ++ int retval; ++}; ++ ++/* + * Special inodes numbers + */ + #define EXT4_BAD_INO 1 /* Bad blocks inode */ +@@ -929,6 +945,7 @@ struct ext4_sb_info { + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; ++ unsigned int s_max_writeback_mb_bump; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1146,6 +1146,64 @@ static int check_block_validity(struct i + } + + /* ++ * Return the number of dirty pages in the given inode starting at ++ * page frame idx. 
++ */ ++static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, ++ unsigned int max_pages) ++{ ++ struct address_space *mapping = inode->i_mapping; ++ pgoff_t index; ++ struct pagevec pvec; ++ pgoff_t num = 0; ++ int i, nr_pages, done = 0; ++ ++ if (max_pages == 0) ++ return 0; ++ pagevec_init(&pvec, 0); ++ while (!done) { ++ index = idx; ++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, ++ PAGECACHE_TAG_DIRTY, ++ (pgoff_t)PAGEVEC_SIZE); ++ if (nr_pages == 0) ++ break; ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pvec.pages[i]; ++ struct buffer_head *bh, *head; ++ ++ lock_page(page); ++ if (unlikely(page->mapping != mapping) || ++ !PageDirty(page) || ++ PageWriteback(page) || ++ page->index != idx) { ++ done = 1; ++ unlock_page(page); ++ break; ++ } ++ head = page_buffers(page); ++ bh = head; ++ do { ++ if (!buffer_delay(bh) && ++ !buffer_unwritten(bh)) { ++ done = 1; ++ break; ++ } ++ } while ((bh = bh->b_this_page) != head); ++ unlock_page(page); ++ if (done) ++ break; ++ idx++; ++ num++; ++ if (num >= max_pages) ++ break; ++ } ++ pagevec_release(&pvec); ++ } ++ return num; ++} ++ ++/* + * The ext4_get_blocks() function tries to look up the requested blocks, + * and returns if the blocks are already mapped. 
+ * +@@ -1881,22 +1939,6 @@ static void ext4_da_page_release_reserva + } + + /* +- * Delayed allocation stuff +- */ +- +-struct mpage_da_data { +- struct inode *inode; +- sector_t b_blocknr; /* start block number of extent */ +- size_t b_size; /* size of extent */ +- unsigned long b_state; /* state of the extent */ +- unsigned long first_page, next_page; /* extent of pages */ +- struct writeback_control *wbc; +- int io_done; +- int pages_written; +- int retval; +-}; +- +-/* + * mpage_da_submit_io - walks through extent of pages and try to write + * them with writepage() call back + * +@@ -2756,8 +2798,10 @@ static int ext4_da_writepages(struct add + int no_nrwrite_index_update; + int pages_written = 0; + long pages_skipped; ++ unsigned int max_pages; + int range_cyclic, cycled = 1, io_done = 0; +- int needed_blocks, ret = 0, nr_to_writebump = 0; ++ int needed_blocks, ret = 0; ++ long desired_nr_to_write, nr_to_writebump = 0; + loff_t range_start = wbc->range_start; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + +@@ -2784,16 +2828,6 @@ static int ext4_da_writepages(struct add + if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) + return -EROFS; + +- /* +- * Make sure nr_to_write is >= sbi->s_mb_stream_request +- * This make sure small files blocks are allocated in +- * single attempt. This ensure that small files +- * get less fragmented. +- */ +- if (wbc->nr_to_write < sbi->s_mb_stream_request) { +- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; +- wbc->nr_to_write = sbi->s_mb_stream_request; +- } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +@@ -2808,6 +2842,36 @@ static int ext4_da_writepages(struct add + } else + index = wbc->range_start >> PAGE_CACHE_SHIFT; + ++ /* ++ * This works around two forms of stupidity. The first is in ++ * the writeback code, which caps the maximum number of pages ++ * written to be 1024 pages. 
This is wrong on multiple ++ * levels; different architectues have a different page size, ++ * which changes the maximum amount of data which gets ++ * written. Secondly, 4 megabytes is way too small. XFS ++ * forces this value to be 16 megabytes by multiplying ++ * nr_to_write parameter by four, and then relies on its ++ * allocator to allocate larger extents to make them ++ * contiguous. Unfortunately this brings us to the second ++ * stupidity, which is that ext4's mballoc code only allocates ++ * at most 2048 blocks. So we force contiguous writes up to ++ * the number of dirty blocks in the inode, or ++ * sbi->max_writeback_mb_bump whichever is smaller. ++ */ ++ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); ++ if (!range_cyclic && range_whole) ++ desired_nr_to_write = wbc->nr_to_write * 8; ++ else ++ desired_nr_to_write = ext4_num_dirty_pages(inode, index, ++ max_pages); ++ if (desired_nr_to_write > max_pages) ++ desired_nr_to_write = max_pages; ++ ++ if (wbc->nr_to_write < desired_nr_to_write) { ++ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; ++ wbc->nr_to_write = desired_nr_to_write; ++ } ++ + mpd.wbc = wbc; + mpd.inode = mapping->host; + +@@ -2926,7 +2990,8 @@ retry: + out_writepages: + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; +- wbc->nr_to_write -= nr_to_writebump; ++ if (wbc->nr_to_write > nr_to_writebump) ++ wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + return ret; +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2199,6 +2199,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); + EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); ++EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); + + static struct attribute *ext4_attrs[] = { + ATTR_LIST(delayed_allocation_blocks), 
+@@ -2212,6 +2213,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(mb_order2_req), + ATTR_LIST(mb_stream_req), + ATTR_LIST(mb_group_prealloc), ++ ATTR_LIST(max_writeback_mb_bump), + NULL, + }; + +@@ -2681,6 +2683,7 @@ static int ext4_fill_super(struct super_ + } + + sbi->s_stripe = ext4_get_stripe_size(sbi); ++ sbi->s_max_writeback_mb_bump = 128; + + /* + * set up enough so that it can read an inode +--- a/include/trace/events/ext4.h ++++ b/include/trace/events/ext4.h +@@ -231,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages, + __field( char, for_reclaim ) + __field( char, for_writepages ) + __field( char, range_cyclic ) ++ __field( pgoff_t, writeback_index ) + ), + + TP_fast_assign( +@@ -245,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages, + __entry->for_reclaim = wbc->for_reclaim; + __entry->for_writepages = wbc->for_writepages; + __entry->range_cyclic = wbc->range_cyclic; ++ __entry->writeback_index = inode->i_mapping->writeback_index; + ), + +- TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d", +- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write, ++ TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu", ++ jbd2_dev_to_name(__entry->dev), ++ (unsigned long) __entry->ino, __entry->nr_to_write, + __entry->pages_skipped, __entry->range_start, + __entry->range_end, __entry->nonblocking, + __entry->for_kupdate, __entry->for_reclaim, +- __entry->for_writepages, __entry->range_cyclic) ++ __entry->for_writepages, __entry->range_cyclic, ++ (unsigned long) __entry->writeback_index) ++); ++ ++TRACE_EVENT(ext4_da_write_pages, ++ TP_PROTO(struct inode *inode, struct mpage_da_data *mpd), ++ ++ TP_ARGS(inode, mpd), ++ ++ TP_STRUCT__entry( ++ __field( dev_t, dev ) ++ __field( ino_t, ino ) ++ __field( __u64, 
b_blocknr ) ++ __field( __u32, b_size ) ++ __field( __u32, b_state ) ++ __field( unsigned long, first_page ) ++ __field( int, io_done ) ++ __field( int, pages_written ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = inode->i_sb->s_dev; ++ __entry->ino = inode->i_ino; ++ __entry->b_blocknr = mpd->b_blocknr; ++ __entry->b_size = mpd->b_size; ++ __entry->b_state = mpd->b_state; ++ __entry->first_page = mpd->first_page; ++ __entry->io_done = mpd->io_done; ++ __entry->pages_written = mpd->pages_written; ++ ), ++ ++ TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", ++ jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, ++ __entry->b_blocknr, __entry->b_size, ++ __entry->b_state, __entry->first_page, ++ __entry->io_done, __entry->pages_written) + ); + + TRACE_EVENT(ext4_da_writepages_result, +@@ -270,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result, + __field( char, encountered_congestion ) + __field( char, more_io ) + __field( char, no_nrwrite_index_update ) ++ __field( pgoff_t, writeback_index ) + ), + + TP_fast_assign( +@@ -281,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result, + __entry->encountered_congestion = wbc->encountered_congestion; + __entry->more_io = wbc->more_io; + __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; ++ __entry->writeback_index = inode->i_mapping->writeback_index; + ), + +- TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", +- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret, ++ TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", ++ jbd2_dev_to_name(__entry->dev), ++ (unsigned long) __entry->ino, __entry->ret, + __entry->pages_written, __entry->pages_skipped, + __entry->encountered_congestion, __entry->more_io, +- __entry->no_nrwrite_index_update) ++ __entry->no_nrwrite_index_update, ++ 
(unsigned long) __entry->writeback_index) + ); + + TRACE_EVENT(ext4_da_write_begin, diff --git a/queue-2.6.31/0041-ext4-release-reserved-quota-when-block-reservation-f.patch b/queue-2.6.31/0041-ext4-release-reserved-quota-when-block-reservation-f.patch new file mode 100644 index 00000000000..34699f64e82 --- /dev/null +++ b/queue-2.6.31/0041-ext4-release-reserved-quota-when-block-reservation-f.patch @@ -0,0 +1,35 @@ +From 2ea2ba3683c5a3d4a77e35e3b3a145f136bbfa44 Mon Sep 17 00:00:00 2001 +From: Mingming Cao +Date: Mon, 28 Sep 2009 15:49:52 -0400 +Subject: [PATCH 41/85] ext4: release reserved quota when block reservation for delalloc retry + +(cherry picked from commit 9f0ccfd8e07d61b413e6536ffa02fbf60d2e20d8) + +ext4_da_reserve_space() can reserve quota blocks multiple times if +ext4_claim_free_blocks() fail and we retry the allocation. We should +release the quota reservation before restarting. + +Bug found by Jan Kara. + +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1855,11 +1855,11 @@ repeat: + + if (ext4_claim_free_blocks(sbi, total)) { + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); ++ vfs_dq_release_reservation_block(inode, total); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } +- vfs_dq_release_reservation_block(inode, total); + return -ENOSPC; + } + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; diff --git a/queue-2.6.31/0042-ext4-Split-uninitialized-extents-for-direct-I-O.patch b/queue-2.6.31/0042-ext4-Split-uninitialized-extents-for-direct-I-O.patch new file mode 100644 index 00000000000..0bcf34b30cb --- /dev/null +++ b/queue-2.6.31/0042-ext4-Split-uninitialized-extents-for-direct-I-O.patch @@ -0,0 +1,654 @@ +From 079fbf5052f559c375452803b25e766de93c2276 Mon Sep 17 00:00:00 2001 +From: Mingming Cao +Date: Mon, 28 Sep 2009 
15:49:08 -0400 +Subject: [PATCH 42/85] ext4: Split uninitialized extents for direct I/O + +(cherry picked from commit 0031462b5b392f90d17f1d75abb795883c44e969) + +When writing into an unitialized extent via direct I/O, and the direct +I/O doesn't exactly cover the unitialized extent, split the extent +into uninitialized and initialized extents before submitting the I/O. +This avoids needing to deal with an ENOSPC error in the end_io +callback that gets used for direct I/O. + +When the IO is complete, the written extent will be marked as initialized. + +Singed-Off-By: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 22 ++ + fs/ext4/ext4_extents.h | 7 + fs/ext4/extents.c | 423 ++++++++++++++++++++++++++++++++++++++++++++----- + fs/ext4/inode.c | 3 + fs/ext4/migrate.c | 2 + fs/ext4/move_extent.c | 4 + 6 files changed, 419 insertions(+), 42 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -113,6 +113,15 @@ struct ext4_allocation_request { + unsigned int flags; + }; + ++typedef struct ext4_io_end { ++ struct inode *inode; /* file being written to */ ++ unsigned int flag; /* sync IO or AIO */ ++ int error; /* I/O error code */ ++ ext4_lblk_t offset; /* offset in the file */ ++ size_t size; /* size of the extent */ ++ struct work_struct work; /* data work queue */ ++} ext4_io_end_t; ++ + /* + * Delayed allocation stuff + */ +@@ -348,7 +357,16 @@ struct ext4_new_group_data { + /* Call ext4_da_update_reserve_space() after successfully + allocating the blocks */ + #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 +- ++ /* caller is from the direct IO path, request to creation of an ++ unitialized extents if not allocated, split the uninitialized ++ extent if blocks has been preallocated already*/ ++#define EXT4_GET_BLOCKS_DIO 0x0010 ++#define EXT4_GET_BLOCKS_CONVERT 0x0020 ++#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ ++ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) ++ /* Convert extent to 
initialized after direct IO complete */ ++#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ ++ EXT4_GET_BLOCKS_DIO_CREATE_EXT) + + /* + * ioctl commands +@@ -1702,6 +1720,8 @@ extern void ext4_ext_init(struct super_b + extern void ext4_ext_release(struct super_block *); + extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len); ++extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ++ loff_t len); + extern int ext4_get_blocks(handle_t *handle, struct inode *inode, + sector_t block, unsigned int max_blocks, + struct buffer_head *bh, int flags); +--- a/fs/ext4/ext4_extents.h ++++ b/fs/ext4/ext4_extents.h +@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_le + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); + } + ++static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) ++{ ++ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); ++} ++ + extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); + extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); + extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); +@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct + struct ext4_ext_path *path, + struct ext4_extent *); + extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); +-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); ++extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); + extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, + ext_prepare_callback, void *); + extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -710,7 +710,7 @@ err: + * insert new index [@logical;@ptr] into the block at @curp; + * check where to insert: before @curp or after @curp + */ +-static int 
ext4_ext_insert_index(handle_t *handle, struct inode *inode, ++int ext4_ext_insert_index(handle_t *handle, struct inode *inode, + struct ext4_ext_path *curp, + int logical, ext4_fsblk_t ptr) + { +@@ -1572,7 +1572,7 @@ out: + */ + int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, +- struct ext4_extent *newext) ++ struct ext4_extent *newext, int flag) + { + struct ext4_extent_header *eh; + struct ext4_extent *ex, *fex; +@@ -1588,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *han + BUG_ON(path[depth].p_hdr == NULL); + + /* try to insert block into found extent and return */ +- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { ++ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) ++ && ext4_can_extents_be_merged(inode, ex, newext)) { + ext_debug("append %d block to %d:%d (from %llu)\n", + ext4_ext_get_actual_len(newext), + le32_to_cpu(ex->ee_block), +@@ -1703,7 +1704,8 @@ has_space: + + merge: + /* try to merge extents to the right */ +- ext4_ext_try_to_merge(inode, path, nearex); ++ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT) ++ ext4_ext_try_to_merge(inode, path, nearex); + + /* try to merge extents to the left */ + +@@ -2470,7 +2472,6 @@ static int ext4_ext_zeroout(struct inode + } + + #define EXT4_EXT_ZERO_LEN 7 +- + /* + * This function is called by ext4_ext_get_blocks() if someone tries to write + * to an uninitialized extent. 
It may result in splitting the uninitialized +@@ -2563,7 +2564,8 @@ static int ext4_ext_convert_to_initializ + ex3->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex3, newblock); + ex3->ee_len = cpu_to_le16(allocated); +- err = ext4_ext_insert_extent(handle, inode, path, ex3); ++ err = ext4_ext_insert_extent(handle, inode, path, ++ ex3, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) +@@ -2619,7 +2621,7 @@ static int ext4_ext_convert_to_initializ + ext4_ext_store_pblock(ex3, newblock + max_blocks); + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); +- err = ext4_ext_insert_extent(handle, inode, path, ex3); ++ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) +@@ -2737,7 +2739,7 @@ static int ext4_ext_convert_to_initializ + err = ext4_ext_dirty(handle, inode, path + depth); + goto out; + insert: +- err = ext4_ext_insert_extent(handle, inode, path, &newex); ++ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) +@@ -2764,6 +2766,320 @@ fix_extent_len: + } + + /* ++ * This function is called by ext4_ext_get_blocks() from ++ * ext4_get_blocks_dio_write() when DIO to write ++ * to an uninitialized extent. ++ * ++ * Writing to an uninitized extent may result in splitting the uninitialized ++ * extent into multiple /intialized unintialized extents (up to three) ++ * There are three possibilities: ++ * a> There is no split required: Entire extent should be uninitialized ++ * b> Splits in two extents: Write is happening at either end of the extent ++ * c> Splits in three extents: Somone is writing in middle of the extent ++ * ++ * One of more index blocks maybe needed if the extent tree grow after ++ * the unintialized extent split. 
To prevent ENOSPC occur at the IO ++ * complete, we need to split the uninitialized extent before DIO submit ++ * the IO. The uninitilized extent called at this time will be split ++ * into three uninitialized extent(at most). After IO complete, the part ++ * being filled will be convert to initialized by the end_io callback function ++ * via ext4_convert_unwritten_extents(). ++ */ ++static int ext4_split_unwritten_extents(handle_t *handle, ++ struct inode *inode, ++ struct ext4_ext_path *path, ++ ext4_lblk_t iblock, ++ unsigned int max_blocks, ++ int flags) ++{ ++ struct ext4_extent *ex, newex, orig_ex; ++ struct ext4_extent *ex1 = NULL; ++ struct ext4_extent *ex2 = NULL; ++ struct ext4_extent *ex3 = NULL; ++ struct ext4_extent_header *eh; ++ ext4_lblk_t ee_block; ++ unsigned int allocated, ee_len, depth; ++ ext4_fsblk_t newblock; ++ int err = 0; ++ int ret = 0; ++ ++ ext_debug("ext4_split_unwritten_extents: inode %lu," ++ "iblock %llu, max_blocks %u\n", inode->i_ino, ++ (unsigned long long)iblock, max_blocks); ++ depth = ext_depth(inode); ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ ee_block = le32_to_cpu(ex->ee_block); ++ ee_len = ext4_ext_get_actual_len(ex); ++ allocated = ee_len - (iblock - ee_block); ++ newblock = iblock - ee_block + ext_pblock(ex); ++ ex2 = ex; ++ orig_ex.ee_block = ex->ee_block; ++ orig_ex.ee_len = cpu_to_le16(ee_len); ++ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); ++ ++ /* ++ * if the entire unintialized extent length less than ++ * the size of extent to write, there is no need to split ++ * uninitialized extent ++ */ ++ if (allocated <= max_blocks) ++ return ret; ++ ++ err = ext4_ext_get_access(handle, inode, path + depth); ++ if (err) ++ goto out; ++ /* ex1: ee_block to iblock - 1 : uninitialized */ ++ if (iblock > ee_block) { ++ ex1 = ex; ++ ex1->ee_len = cpu_to_le16(iblock - ee_block); ++ ext4_ext_mark_uninitialized(ex1); ++ ex2 = &newex; ++ } ++ /* ++ * for sanity, update the length of the ex2 extent before ++ * we 
insert ex3, if ex1 is NULL. This is to avoid temporary ++ * overlap of blocks. ++ */ ++ if (!ex1 && allocated > max_blocks) ++ ex2->ee_len = cpu_to_le16(max_blocks); ++ /* ex3: to ee_block + ee_len : uninitialised */ ++ if (allocated > max_blocks) { ++ unsigned int newdepth; ++ ex3 = &newex; ++ ex3->ee_block = cpu_to_le32(iblock + max_blocks); ++ ext4_ext_store_pblock(ex3, newblock + max_blocks); ++ ex3->ee_len = cpu_to_le16(allocated - max_blocks); ++ ext4_ext_mark_uninitialized(ex3); ++ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); ++ if (err == -ENOSPC) { ++ err = ext4_ext_zeroout(inode, &orig_ex); ++ if (err) ++ goto fix_extent_len; ++ /* update the extent length and mark as initialized */ ++ ex->ee_block = orig_ex.ee_block; ++ ex->ee_len = orig_ex.ee_len; ++ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ++ ext4_ext_dirty(handle, inode, path + depth); ++ /* zeroed the full extent */ ++ /* blocks available from iblock */ ++ return allocated; ++ ++ } else if (err) ++ goto fix_extent_len; ++ /* ++ * The depth, and hence eh & ex might change ++ * as part of the insert above. 
++ */ ++ newdepth = ext_depth(inode); ++ /* ++ * update the extent length after successful insert of the ++ * split extent ++ */ ++ orig_ex.ee_len = cpu_to_le16(ee_len - ++ ext4_ext_get_actual_len(ex3)); ++ depth = newdepth; ++ ext4_ext_drop_refs(path); ++ path = ext4_ext_find_extent(inode, iblock, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ goto out; ++ } ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ if (ex2 != &newex) ++ ex2 = ex; ++ ++ err = ext4_ext_get_access(handle, inode, path + depth); ++ if (err) ++ goto out; ++ ++ allocated = max_blocks; ++ } ++ /* ++ * If there was a change of depth as part of the ++ * insertion of ex3 above, we need to update the length ++ * of the ex1 extent again here ++ */ ++ if (ex1 && ex1 != ex) { ++ ex1 = ex; ++ ex1->ee_len = cpu_to_le16(iblock - ee_block); ++ ext4_ext_mark_uninitialized(ex1); ++ ex2 = &newex; ++ } ++ /* ++ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, ++ * uninitialised still. ++ */ ++ ex2->ee_block = cpu_to_le32(iblock); ++ ext4_ext_store_pblock(ex2, newblock); ++ ex2->ee_len = cpu_to_le16(allocated); ++ ext4_ext_mark_uninitialized(ex2); ++ if (ex2 != ex) ++ goto insert; ++ /* Mark modified extent as dirty */ ++ err = ext4_ext_dirty(handle, inode, path + depth); ++ ext_debug("out here\n"); ++ goto out; ++insert: ++ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); ++ if (err == -ENOSPC) { ++ err = ext4_ext_zeroout(inode, &orig_ex); ++ if (err) ++ goto fix_extent_len; ++ /* update the extent length and mark as initialized */ ++ ex->ee_block = orig_ex.ee_block; ++ ex->ee_len = orig_ex.ee_len; ++ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ++ ext4_ext_dirty(handle, inode, path + depth); ++ /* zero out the first half */ ++ return allocated; ++ } else if (err) ++ goto fix_extent_len; ++out: ++ ext4_ext_show_leaf(inode, path); ++ return err ? 
err : allocated; ++ ++fix_extent_len: ++ ex->ee_block = orig_ex.ee_block; ++ ex->ee_len = orig_ex.ee_len; ++ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); ++ ext4_ext_mark_uninitialized(ex); ++ ext4_ext_dirty(handle, inode, path + depth); ++ return err; ++} ++static int ext4_convert_unwritten_extents_dio(handle_t *handle, ++ struct inode *inode, ++ struct ext4_ext_path *path) ++{ ++ struct ext4_extent *ex; ++ struct ext4_extent_header *eh; ++ int depth; ++ int err = 0; ++ int ret = 0; ++ ++ depth = ext_depth(inode); ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ ++ err = ext4_ext_get_access(handle, inode, path + depth); ++ if (err) ++ goto out; ++ /* first mark the extent as initialized */ ++ ext4_ext_mark_initialized(ex); ++ ++ /* ++ * We have to see if it can be merged with the extent ++ * on the left. ++ */ ++ if (ex > EXT_FIRST_EXTENT(eh)) { ++ /* ++ * To merge left, pass "ex - 1" to try_to_merge(), ++ * since it merges towards right _only_. ++ */ ++ ret = ext4_ext_try_to_merge(inode, path, ex - 1); ++ if (ret) { ++ err = ext4_ext_correct_indexes(handle, inode, path); ++ if (err) ++ goto out; ++ depth = ext_depth(inode); ++ ex--; ++ } ++ } ++ /* ++ * Try to Merge towards right. 
++ */ ++ ret = ext4_ext_try_to_merge(inode, path, ex); ++ if (ret) { ++ err = ext4_ext_correct_indexes(handle, inode, path); ++ if (err) ++ goto out; ++ depth = ext_depth(inode); ++ } ++ /* Mark modified extent as dirty */ ++ err = ext4_ext_dirty(handle, inode, path + depth); ++out: ++ ext4_ext_show_leaf(inode, path); ++ return err; ++} ++ ++static int ++ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ++ ext4_lblk_t iblock, unsigned int max_blocks, ++ struct ext4_ext_path *path, int flags, ++ unsigned int allocated, struct buffer_head *bh_result, ++ ext4_fsblk_t newblock) ++{ ++ int ret = 0; ++ int err = 0; ++ ++ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" ++ "block %llu, max_blocks %u, flags %d, allocated %u", ++ inode->i_ino, (unsigned long long)iblock, max_blocks, ++ flags, allocated); ++ ext4_ext_show_leaf(inode, path); ++ ++ /* DIO get_block() before submit the IO, split the extent */ ++ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { ++ ret = ext4_split_unwritten_extents(handle, ++ inode, path, iblock, ++ max_blocks, flags); ++ goto out; ++ } ++ /* DIO end_io complete, convert the filled extent to written */ ++ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ++ ret = ext4_convert_unwritten_extents_dio(handle, inode, ++ path); ++ goto out2; ++ } ++ /* buffered IO case */ ++ /* ++ * repeat fallocate creation request ++ * we already have an unwritten extent ++ */ ++ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) ++ goto map_out; ++ ++ /* buffered READ or buffered write_begin() lookup */ ++ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { ++ /* ++ * We have blocks reserved already. We ++ * return allocated blocks so that delalloc ++ * won't do block reservation for us. But ++ * the buffer head will be unmapped so that ++ * a read from the block returns 0s. 
++ */ ++ set_buffer_unwritten(bh_result); ++ goto out1; ++ } ++ ++ /* buffered write, writepage time, convert*/ ++ ret = ext4_ext_convert_to_initialized(handle, inode, ++ path, iblock, ++ max_blocks); ++out: ++ if (ret <= 0) { ++ err = ret; ++ goto out2; ++ } else ++ allocated = ret; ++ set_buffer_new(bh_result); ++map_out: ++ set_buffer_mapped(bh_result); ++out1: ++ if (allocated > max_blocks) ++ allocated = max_blocks; ++ ext4_ext_show_leaf(inode, path); ++ bh_result->b_bdev = inode->i_sb->s_bdev; ++ bh_result->b_blocknr = newblock; ++out2: ++ if (path) { ++ ext4_ext_drop_refs(path); ++ kfree(path); ++ } ++ return err ? err : allocated; ++} ++/* + * Block allocation/map/preallocation routine for extents based files + * + * +@@ -2868,33 +3184,10 @@ int ext4_ext_get_blocks(handle_t *handle + EXT4_EXT_CACHE_EXTENT); + goto out; + } +- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) +- goto out; +- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { +- if (allocated > max_blocks) +- allocated = max_blocks; +- /* +- * We have blocks reserved already. We +- * return allocated blocks so that delalloc +- * won't do block reservation for us. But +- * the buffer head will be unmapped so that +- * a read from the block returns 0s. 
+- */ +- set_buffer_unwritten(bh_result); +- bh_result->b_bdev = inode->i_sb->s_bdev; +- bh_result->b_blocknr = newblock; +- goto out2; +- } +- +- ret = ext4_ext_convert_to_initialized(handle, inode, +- path, iblock, +- max_blocks); +- if (ret <= 0) { +- err = ret; +- goto out2; +- } else +- allocated = ret; +- goto outnew; ++ ret = ext4_ext_handle_uninitialized_extents(handle, ++ inode, iblock, max_blocks, path, ++ flags, allocated, bh_result, newblock); ++ return ret; + } + } + +@@ -2967,7 +3260,7 @@ int ext4_ext_get_blocks(handle_t *handle + newex.ee_len = cpu_to_le16(ar.len); + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ + ext4_ext_mark_uninitialized(&newex); +- err = ext4_ext_insert_extent(handle, inode, path, &newex); ++ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err) { + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, +@@ -2981,7 +3274,6 @@ int ext4_ext_get_blocks(handle_t *handle + /* previous routine could use block we allocated */ + newblock = ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); +-outnew: + set_buffer_new(bh_result); + + /* Cache only when it is _not_ an uninitialized extent */ +@@ -3180,6 +3472,63 @@ retry: + } + + /* ++ * This function convert a range of blocks to written extents ++ * The caller of this function will pass the start offset and the size. ++ * all unwritten extents within this range will be converted to ++ * written extents. ++ * ++ * This function is called from the direct IO end io call back ++ * function, to convert the fallocated extents after IO is completed. 
++ */ ++int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ++ loff_t len) ++{ ++ handle_t *handle; ++ ext4_lblk_t block; ++ unsigned int max_blocks; ++ int ret = 0; ++ int ret2 = 0; ++ struct buffer_head map_bh; ++ unsigned int credits, blkbits = inode->i_blkbits; ++ ++ block = offset >> blkbits; ++ /* ++ * We can't just convert len to max_blocks because ++ * If blocksize = 4096 offset = 3072 and len = 2048 ++ */ ++ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) ++ - block; ++ /* ++ * credits to insert 1 extent into extent tree ++ */ ++ credits = ext4_chunk_trans_blocks(inode, max_blocks); ++ while (ret >= 0 && ret < max_blocks) { ++ block = block + ret; ++ max_blocks = max_blocks - ret; ++ handle = ext4_journal_start(inode, credits); ++ if (IS_ERR(handle)) { ++ ret = PTR_ERR(handle); ++ break; ++ } ++ map_bh.b_state = 0; ++ ret = ext4_get_blocks(handle, inode, block, ++ max_blocks, &map_bh, ++ EXT4_GET_BLOCKS_DIO_CONVERT_EXT); ++ if (ret <= 0) { ++ WARN_ON(ret <= 0); ++ printk(KERN_ERR "%s: ext4_ext_get_blocks " ++ "returned error inode#%lu, block=%u, " ++ "max_blocks=%u", __func__, ++ inode->i_ino, block, max_blocks); ++ } ++ ext4_mark_inode_dirty(handle, inode); ++ ret2 = ext4_journal_stop(handle); ++ if (ret <= 0 || ret2 ) ++ break; ++ } ++ return ret > 0 ? ret2 : ret; ++} ++/* + * Callback function called for each extent to gather FIEMAP information. + */ + static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1234,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, st + clear_buffer_mapped(bh); + clear_buffer_unwritten(bh); + ++ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," ++ "logical block %lu\n", inode->i_ino, flags, max_blocks, ++ (unsigned long)block); + /* + * Try to see if we can get the block without requesting a new + * file system block. 
+--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle + goto err_out; + } + } +- retval = ext4_ext_insert_extent(handle, inode, path, &newext); ++ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + err_out: + if (path) { + ext4_ext_drop_refs(path); +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *hand + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, +- orig_path, new_ext)) ++ orig_path, new_ext, 0)) + goto out; + } + +@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *hand + goto out; + + if (ext4_ext_insert_extent(handle, orig_inode, +- orig_path, end_ext)) ++ orig_path, end_ext, 0)) + goto out; + } + out: diff --git a/queue-2.6.31/0043-ext4-Use-end_io-callback-to-avoid-direct-I-O-fallbac.patch b/queue-2.6.31/0043-ext4-Use-end_io-callback-to-avoid-direct-I-O-fallbac.patch new file mode 100644 index 00000000000..04c84bcc688 --- /dev/null +++ b/queue-2.6.31/0043-ext4-Use-end_io-callback-to-avoid-direct-I-O-fallbac.patch @@ -0,0 +1,305 @@ +From 876a265a60f36961c5e06d6890cc7a861c119cd1 Mon Sep 17 00:00:00 2001 +From: Mingming Cao +Date: Mon, 28 Sep 2009 15:48:41 -0400 +Subject: [PATCH 43/85] ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O + +(cherry picked from commit 4c0425ff68b1b87b802ffeda7b6a46ff7da7241c) + +Currently the DIO VFS code passes create = 0 when writing to the +middle of file. It does this to avoid block allocation for holes, so +as not to expose stale data out when there is a parallel buffered read +(which does not hold the i_mutex lock). Direct I/O writes into holes +falls back to buffered IO for this reason. + +Since preallocated extents are treated as holes when doing a +get_block() look up (buffer is not mapped), direct IO over fallocate +also falls back to buffered IO. Thus ext4 actually silently falls +back to buffered IO in above two cases, which is undesirable. 
+ +To fix this, this patch creates unitialized extents when a direct I/O +write into holes in sparse files, and registering an end_io callback which +converts the uninitialized extent to an initialized extent after the +I/O is completed. + +Singed-Off-By: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 3 + fs/ext4/inode.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- + fs/ext4/super.c | 11 +++ + 3 files changed, 210 insertions(+), 1 deletion(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1002,6 +1002,9 @@ struct ext4_sb_info { + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; ++ ++ /* workqueue for dio unwritten */ ++ struct workqueue_struct *dio_unwritten_wq; + }; + + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -37,6 +37,7 @@ + #include <linux/namei.h> + #include <linux/uio.h> + #include <linux/bio.h> ++#include <linux/workqueue.h> + + #include "ext4_jbd2.h" + #include "xattr.h" +@@ -3350,6 +3351,8 @@ static int ext4_releasepage(struct page + } + + /* ++ * O_DIRECT for ext3 (or indirect map) based files ++ * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size + * if the machine crashes during the write. +@@ -3358,7 +3361,7 @@ static int ext4_releasepage(struct page + * crashes then stale disk data _may_ be exposed inside the file. But current + * VFS code falls back into buffered path in that case so we are safe. + */ +-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, ++static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) + { +@@ -3432,6 +3435,198 @@ out: + return ret; + } + ++/* Maximum number of blocks we map for direct IO at once. 
*/ ++ ++static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, ++ struct buffer_head *bh_result, int create) ++{ ++ handle_t *handle = NULL; ++ int ret = 0; ++ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; ++ int dio_credits; ++ ++ /* ++ * DIO VFS code passes create = 0 flag for write to ++ * the middle of file. It does this to avoid block ++ * allocation for holes, to prevent expose stale data ++ * out when there is parallel buffered read (which does ++ * not hold the i_mutex lock) while direct IO write has ++ * not completed. DIO request on holes finally falls back ++ * to buffered IO for this reason. ++ * ++ * For ext4 extent based file, since we support fallocate, ++ * new allocated extent as uninitialized, for holes, we ++ * could fallocate blocks for holes, thus parallel ++ * buffered IO read will zero out the page when read on ++ * a hole while parallel DIO write to the hole has not completed. ++ * ++ * when we come here, we know it's a direct IO write to ++ * to the middle of file (<i_size) ++ * so it's safe to override the create flag from VFS. ++ */ ++ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT; ++ ++ if (max_blocks > DIO_MAX_BLOCKS) ++ max_blocks = DIO_MAX_BLOCKS; ++ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); ++ handle = ext4_journal_start(inode, dio_credits); ++ if (IS_ERR(handle)) { ++ ret = PTR_ERR(handle); ++ goto out; ++ } ++ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, ++ create); ++ if (ret > 0) { ++ bh_result->b_size = (ret << inode->i_blkbits); ++ ret = 0; ++ } ++ ext4_journal_stop(handle); ++out: ++ return ret; ++} ++ ++#define DIO_AIO 0x1 ++ ++static void ext4_free_io_end(ext4_io_end_t *io) ++{ ++ kfree(io); ++} ++ ++/* ++ * IO write completion for unwritten extents. ++ * ++ * check a range of space and convert unwritten extents to written. 
++ */ ++static void ext4_end_dio_unwritten(struct work_struct *work) ++{ ++ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); ++ struct inode *inode = io->inode; ++ loff_t offset = io->offset; ++ size_t size = io->size; ++ int ret = 0; ++ int aio = io->flag & DIO_AIO; ++ ++ if (aio) ++ mutex_lock(&inode->i_mutex); ++ if (offset + size <= i_size_read(inode)) ++ ret = ext4_convert_unwritten_extents(inode, offset, size); ++ ++ if (ret < 0) ++ printk(KERN_EMERG "%s: failed to convert unwritten" ++ "extents to written extents, error is %d\n", ++ __func__, ret); ++ ++ ext4_free_io_end(io); ++ if (aio) ++ mutex_unlock(&inode->i_mutex); ++} ++ ++static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) ++{ ++ ext4_io_end_t *io = NULL; ++ ++ io = kmalloc(sizeof(*io), GFP_NOFS); ++ ++ if (io) { ++ io->inode = inode; ++ io->flag = flag; ++ io->offset = 0; ++ io->size = 0; ++ io->error = 0; ++ INIT_WORK(&io->work, ext4_end_dio_unwritten); ++ } ++ ++ return io; ++} ++ ++static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ++ ssize_t size, void *private) ++{ ++ ext4_io_end_t *io_end = iocb->private; ++ struct workqueue_struct *wq; ++ ++ /* if not hole or unwritten extents, just simple return */ ++ if (!io_end || !size || !iocb->private) ++ return; ++ io_end->offset = offset; ++ io_end->size = size; ++ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; ++ ++ /* We need to convert unwritten extents to written */ ++ queue_work(wq, &io_end->work); ++ ++ if (is_sync_kiocb(iocb)) ++ flush_workqueue(wq); ++ ++ iocb->private = NULL; ++} ++/* ++ * For ext4 extent files, ext4 will do direct-io write to holes, ++ * preallocated extents, and those write extend the file, no need to ++ * fall back to buffered IO. ++ * ++ * For holes, we fallocate those blocks, mark them as unintialized ++ * If those blocks were preallocated, we mark sure they are splited, but ++ * still keep the range to write as unintialized. 
++ * ++ * When end_io call back function called at the last IO complete time, ++ * those extents will be converted to written extents. ++ * ++ * If the O_DIRECT write will extend the file then add this inode to the ++ * orphan list. So recovery will truncate it back to the original size ++ * if the machine crashes during the write. ++ * ++ */ ++static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ++ const struct iovec *iov, loff_t offset, ++ unsigned long nr_segs) ++{ ++ struct file *file = iocb->ki_filp; ++ struct inode *inode = file->f_mapping->host; ++ ssize_t ret; ++ size_t count = iov_length(iov, nr_segs); ++ ++ loff_t final_size = offset + count; ++ if (rw == WRITE && final_size <= inode->i_size) { ++ /* ++ * For DIO we fallocate blocks for holes, we fallocate blocks ++ * The fallocated extent for hole is marked as uninitialized ++ * to prevent paralel buffered read to expose the stale data ++ * before DIO complete the data IO. ++ * as for previously fallocated extents, ext4 get_block ++ * will just simply mark the buffer mapped but still ++ * keep the extents uninitialized. 
++ * ++ * At the end of IO, the ext4 end_io callback function ++ * will convert those unwritten extents to written, ++ * ++ */ ++ iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); ++ if (!iocb->private) ++ return -ENOMEM; ++ ret = blockdev_direct_IO(rw, iocb, inode, ++ inode->i_sb->s_bdev, iov, ++ offset, nr_segs, ++ ext4_get_block_dio_write, ++ ext4_end_io_dio); ++ return ret; ++ } ++ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); ++} ++ ++static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, ++ const struct iovec *iov, loff_t offset, ++ unsigned long nr_segs) ++{ ++ struct file *file = iocb->ki_filp; ++ struct inode *inode = file->f_mapping->host; ++ ++ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) ++ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); ++ ++ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); ++} ++ + /* + * Pages can be marked dirty completely asynchronously from ext4's journalling + * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -580,6 +580,9 @@ static void ext4_put_super(struct super_ + struct ext4_super_block *es = sbi->s_es; + int i, err; + ++ flush_workqueue(sbi->dio_unwritten_wq); ++ destroy_workqueue(sbi->dio_unwritten_wq); ++ + lock_super(sb); + lock_kernel(); + if (sb->s_dirt) +@@ -2809,6 +2812,12 @@ no_journal: + clear_opt(sbi->s_mount_opt, NOBH); + } + } ++ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); ++ if (!EXT4_SB(sb)->dio_unwritten_wq) { ++ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); ++ goto failed_mount_wq; ++ } ++ + /* + * The jbd2_journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. 
+@@ -2921,6 +2930,8 @@ cantfind_ext4: + + failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); ++ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); ++failed_mount_wq: + ext4_release_system_zone(sb); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); diff --git a/queue-2.6.31/0044-ext4-async-direct-IO-for-holes-and-fallocate-support.patch b/queue-2.6.31/0044-ext4-async-direct-IO-for-holes-and-fallocate-support.patch new file mode 100644 index 00000000000..4f834c48ee1 --- /dev/null +++ b/queue-2.6.31/0044-ext4-async-direct-IO-for-holes-and-fallocate-support.patch @@ -0,0 +1,479 @@ +From c16e4c11f69bac047eff03aa656af1080e378060 Mon Sep 17 00:00:00 2001 +From: Mingming Cao +Date: Mon, 28 Sep 2009 15:48:29 -0400 +Subject: [PATCH 44/85] ext4: async direct IO for holes and fallocate support + +(cherry picked from commit 8d5d02e6b176565c77ff03604908b1453a22044d) + +For async direct IO that covers holes or fallocate, the end_io +callback function now queued the convertion work on workqueue but +don't flush the work rightaway as it might take too long to afford. + +But when fsync is called after all the data is completed, user expects +the metadata also being updated before fsync returns. + +Thus we need to flush the conversion work when fsync() is called. +This patch keep track of a listed of completed async direct io that +has a work queued on workqueue. When fsync() is called, it will go +through the list and do the conversion. 
+ +Signed-off-by: Mingming Cao +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 9 +- + fs/ext4/extents.c | 19 ++++ + fs/ext4/fsync.c | 5 + + fs/ext4/inode.c | 231 +++++++++++++++++++++++++++++++++++++++++++++--------- + fs/ext4/super.c | 8 + + 5 files changed, 233 insertions(+), 39 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -113,7 +113,9 @@ struct ext4_allocation_request { + unsigned int flags; + }; + ++#define DIO_AIO_UNWRITTEN 0x1 + typedef struct ext4_io_end { ++ struct list_head list; /* per-file finished AIO list */ + struct inode *inode; /* file being written to */ + unsigned int flag; /* sync IO or AIO */ + int error; /* I/O error code */ +@@ -692,6 +694,11 @@ struct ext4_inode_info { + __u16 i_extra_isize; + + spinlock_t i_block_reservation_lock; ++ ++ /* completed async DIOs that might need unwritten extents handling */ ++ struct list_head i_aio_dio_complete_list; ++ /* current io_end structure for async DIO write*/ ++ ext4_io_end_t *cur_aio_dio; + }; + + /* +@@ -1424,7 +1431,7 @@ extern int ext4_block_truncate_page(hand + struct address_space *mapping, loff_t from); + extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); + extern qsize_t ext4_get_reserved_space(struct inode *inode); +- ++extern int flush_aio_dio_completed_IO(struct inode *inode); + /* ioctl.c */ + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3012,6 +3012,7 @@ ext4_ext_handle_uninitialized_extents(ha + { + int ret = 0; + int err = 0; ++ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" + "block %llu, max_blocks %u, flags %d, allocated %u", +@@ -3024,6 +3025,9 @@ ext4_ext_handle_uninitialized_extents(ha + ret = ext4_split_unwritten_extents(handle, + inode, path, iblock, + max_blocks, flags); ++ /* flag the io_end 
struct that we need convert when IO done */ ++ if (io) ++ io->flag = DIO_AIO_UNWRITTEN; + goto out; + } + /* DIO end_io complete, convert the filled extent to written */ +@@ -3109,6 +3113,7 @@ int ext4_ext_get_blocks(handle_t *handle + int err = 0, depth, ret, cache_type; + unsigned int allocated = 0; + struct ext4_allocation_request ar; ++ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; + + __clear_bit(BH_New, &bh_result->b_state); + ext_debug("blocks %u/%u requested for inode %u\n", +@@ -3258,8 +3263,20 @@ int ext4_ext_get_blocks(handle_t *handle + /* try to insert new extent into found leaf and return */ + ext4_ext_store_pblock(&newex, newblock); + newex.ee_len = cpu_to_le16(ar.len); +- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ ++ /* Mark uninitialized */ ++ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ + ext4_ext_mark_uninitialized(&newex); ++ /* ++ * io_end structure was created for every async ++ * direct IO write to the middle of the file. ++ * To avoid unecessary convertion for every aio dio rewrite ++ * to the mid of file, here we flag the IO that is really ++ * need the convertion. ++ * ++ */ ++ if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) ++ io->flag = DIO_AIO_UNWRITTEN; ++ } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err) { + /* free data blocks we just allocated */ +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -44,6 +44,8 @@ + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. ++ * ++ * i_mutex lock is held when entering and exiting this function + */ + + int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) +@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, st + + trace_ext4_sync_file(file, dentry, datasync); + ++ ret = flush_aio_dio_completed_IO(inode); ++ if (ret < 0) ++ goto out; + /* + * data=writeback: + * The caller's filemap_fdatawrite()/wait will sync the data. 
+--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3445,6 +3445,8 @@ static int ext4_get_block_dio_write(stru + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; + ++ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", ++ inode->i_ino, create); + /* + * DIO VFS code passes create = 0 flag for write to + * the middle of file. It does this to avoid block +@@ -3485,55 +3487,152 @@ out: + return ret; + } + +-#define DIO_AIO 0x1 +- + static void ext4_free_io_end(ext4_io_end_t *io) + { ++ BUG_ON(!io); ++ iput(io->inode); + kfree(io); + } ++static void dump_aio_dio_list(struct inode * inode) ++{ ++#ifdef EXT4_DEBUG ++ struct list_head *cur, *before, *after; ++ ext4_io_end_t *io, *io0, *io1; ++ ++ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ ++ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); ++ return; ++ } ++ ++ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); ++ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ ++ cur = &io->list; ++ before = cur->prev; ++ io0 = container_of(before, ext4_io_end_t, list); ++ after = cur->next; ++ io1 = container_of(after, ext4_io_end_t, list); ++ ++ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", ++ io, inode->i_ino, io0, io1); ++ } ++#endif ++} + + /* +- * IO write completion for unwritten extents. +- * + * check a range of space and convert unwritten extents to written. 
+ */ +-static void ext4_end_dio_unwritten(struct work_struct *work) ++static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) + { +- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); + struct inode *inode = io->inode; + loff_t offset = io->offset; + size_t size = io->size; + int ret = 0; +- int aio = io->flag & DIO_AIO; + +- if (aio) +- mutex_lock(&inode->i_mutex); ++ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," ++ "list->prev 0x%p\n", ++ io, inode->i_ino, io->list.next, io->list.prev); ++ ++ if (list_empty(&io->list)) ++ return ret; ++ ++ if (io->flag != DIO_AIO_UNWRITTEN) ++ return ret; ++ + if (offset + size <= i_size_read(inode)) + ret = ext4_convert_unwritten_extents(inode, offset, size); + +- if (ret < 0) ++ if (ret < 0) { + printk(KERN_EMERG "%s: failed to convert unwritten" +- "extents to written extents, error is %d\n", +- __func__, ret); ++ "extents to written extents, error is %d" ++ " io is still on inode %lu aio dio list\n", ++ __func__, ret, inode->i_ino); ++ return ret; ++ } ++ ++ /* clear the DIO AIO unwritten flag */ ++ io->flag = 0; ++ return ret; ++} ++/* ++ * work on completed aio dio IO, to convert unwritten extents to extents ++ */ ++static void ext4_end_aio_dio_work(struct work_struct *work) ++{ ++ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); ++ struct inode *inode = io->inode; ++ int ret = 0; + +- ext4_free_io_end(io); +- if (aio) +- mutex_unlock(&inode->i_mutex); ++ mutex_lock(&inode->i_mutex); ++ ret = ext4_end_aio_dio_nolock(io); ++ if (ret >= 0) { ++ if (!list_empty(&io->list)) ++ list_del_init(&io->list); ++ ext4_free_io_end(io); ++ } ++ mutex_unlock(&inode->i_mutex); + } ++/* ++ * This function is called from ext4_sync_file(). ++ * ++ * When AIO DIO IO is completed, the work to convert unwritten ++ * extents to written is queued on workqueue but may not get immediately ++ * scheduled. When fsync is called, we need to ensure the ++ * conversion is complete before fsync returns. 
++ * The inode keeps track of a list of completed AIO from DIO path ++ * that might needs to do the conversion. This function walks through ++ * the list and convert the related unwritten extents to written. ++ */ ++int flush_aio_dio_completed_IO(struct inode *inode) ++{ ++ ext4_io_end_t *io; ++ int ret = 0; ++ int ret2 = 0; ++ ++ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) ++ return ret; + +-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) ++ dump_aio_dio_list(inode); ++ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ ++ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, ++ ext4_io_end_t, list); ++ /* ++ * Calling ext4_end_aio_dio_nolock() to convert completed ++ * IO to written. ++ * ++ * When ext4_sync_file() is called, run_queue() may already ++ * about to flush the work corresponding to this io structure. ++ * It will be upset if it founds the io structure related ++ * to the work-to-be schedule is freed. ++ * ++ * Thus we need to keep the io structure still valid here after ++ * convertion finished. The io structure has a flag to ++ * avoid double converting from both fsync and background work ++ * queue work. ++ */ ++ ret = ext4_end_aio_dio_nolock(io); ++ if (ret < 0) ++ ret2 = ret; ++ else ++ list_del_init(&io->list); ++ } ++ return (ret2 < 0) ? 
ret2 : 0; ++} ++ ++static ext4_io_end_t *ext4_init_io_end (struct inode *inode) + { + ext4_io_end_t *io = NULL; + + io = kmalloc(sizeof(*io), GFP_NOFS); + + if (io) { ++ igrab(inode); + io->inode = inode; +- io->flag = flag; ++ io->flag = 0; + io->offset = 0; + io->size = 0; + io->error = 0; +- INIT_WORK(&io->work, ext4_end_dio_unwritten); ++ INIT_WORK(&io->work, ext4_end_aio_dio_work); ++ INIT_LIST_HEAD(&io->list); + } + + return io; +@@ -3545,19 +3644,31 @@ static void ext4_end_io_dio(struct kiocb + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + +- /* if not hole or unwritten extents, just simple return */ +- if (!io_end || !size || !iocb->private) ++ ext_debug("ext4_end_io_dio(): io_end 0x%p" ++ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", ++ iocb->private, io_end->inode->i_ino, iocb, offset, ++ size); ++ /* if not async direct IO or dio with 0 bytes write, just return */ ++ if (!io_end || !size) ++ return; ++ ++ /* if not aio dio with unwritten extents, just free io and return */ ++ if (io_end->flag != DIO_AIO_UNWRITTEN){ ++ ext4_free_io_end(io_end); ++ iocb->private = NULL; + return; ++ } ++ + io_end->offset = offset; + io_end->size = size; + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + +- /* We need to convert unwritten extents to written */ ++ /* queue the work to convert unwritten extents to written */ + queue_work(wq, &io_end->work); + +- if (is_sync_kiocb(iocb)) +- flush_workqueue(wq); +- ++ /* Add the io_end to per-inode completed aio dio list*/ ++ list_add_tail(&io_end->list, ++ &EXT4_I(io_end->inode)->i_aio_dio_complete_list); + iocb->private = NULL; + } + /* +@@ -3569,8 +3680,10 @@ static void ext4_end_io_dio(struct kiocb + * If those blocks were preallocated, we mark sure they are splited, but + * still keep the range to write as unintialized. + * +- * When end_io call back function called at the last IO complete time, +- * those extents will be converted to written extents. 
++ * The unwrritten extents will be converted to written when DIO is completed. ++ * For async direct IO, since the IO may still pending when return, we ++ * set up an end_io call back function, which will do the convertion ++ * when async direct IO completed. + * + * If the O_DIRECT write will extend the file then add this inode to the + * orphan list. So recovery will truncate it back to the original size +@@ -3589,28 +3702,76 @@ static ssize_t ext4_ext_direct_IO(int rw + loff_t final_size = offset + count; + if (rw == WRITE && final_size <= inode->i_size) { + /* +- * For DIO we fallocate blocks for holes, we fallocate blocks +- * The fallocated extent for hole is marked as uninitialized ++ * We could direct write to holes and fallocate. ++ * ++ * Allocated blocks to fill the hole are marked as uninitialized + * to prevent paralel buffered read to expose the stale data + * before DIO complete the data IO. +- * as for previously fallocated extents, ext4 get_block ++ * ++ * As to previously fallocated extents, ext4 get_block + * will just simply mark the buffer mapped but still + * keep the extents uninitialized. + * +- * At the end of IO, the ext4 end_io callback function +- * will convert those unwritten extents to written, +- * ++ * for non AIO case, we will convert those unwritten extents ++ * to written after return back from blockdev_direct_IO. ++ * ++ * for async DIO, the conversion needs to be defered when ++ * the IO is completed. The ext4 end_io callback function ++ * will be called to take care of the conversion work. ++ * Here for async case, we allocate an io_end structure to ++ * hook to the iocb. 
+ */ +- iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); +- if (!iocb->private) +- return -ENOMEM; ++ iocb->private = NULL; ++ EXT4_I(inode)->cur_aio_dio = NULL; ++ if (!is_sync_kiocb(iocb)) { ++ iocb->private = ext4_init_io_end(inode); ++ if (!iocb->private) ++ return -ENOMEM; ++ /* ++ * we save the io structure for current async ++ * direct IO, so that later ext4_get_blocks() ++ * could flag the io structure whether there ++ * is a unwritten extents needs to be converted ++ * when IO is completed. ++ */ ++ EXT4_I(inode)->cur_aio_dio = iocb->private; ++ } ++ + ret = blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block_dio_write, + ext4_end_io_dio); ++ if (iocb->private) ++ EXT4_I(inode)->cur_aio_dio = NULL; ++ /* ++ * The io_end structure takes a reference to the inode, ++ * that structure needs to be destroyed and the ++ * reference to the inode need to be dropped, when IO is ++ * complete, even with 0 byte write, or failed. ++ * ++ * In the successful AIO DIO case, the io_end structure will be ++ * desctroyed and the reference to the inode will be dropped ++ * after the end_io call back function is called. ++ * ++ * In the case there is 0 byte write, or error case, since ++ * VFS direct IO won't invoke the end_io call back function, ++ * we need to free the end_io structure here. 
++ */ ++ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ++ ext4_free_io_end(iocb->private); ++ iocb->private = NULL; ++ } else if (ret > 0) ++ /* ++ * for non AIO case, since the IO is already ++ * completed, we could do the convertion right here ++ */ ++ ret = ext4_convert_unwritten_extents(inode, ++ offset, ret); + return ret; + } ++ ++ /* for write the the end of file case, we fall back to old way */ + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); + } + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(st + ei->i_allocated_meta_blocks = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); ++ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ++ ei->cur_aio_dio = NULL; + + return &ei->vfs_inode; + } +@@ -3383,11 +3385,13 @@ static int ext4_sync_fs(struct super_blo + { + int ret = 0; + tid_t target; ++ struct ext4_sb_info *sbi = EXT4_SB(sb); + + trace_ext4_sync_fs(sb, wait); +- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { ++ flush_workqueue(sbi->dio_unwritten_wq); ++ if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) +- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); ++ jbd2_log_wait_commit(sbi->s_journal, target); + } + return ret; + } diff --git a/queue-2.6.31/0045-ext4-EXT4_IOC_MOVE_EXT-Check-for-different-original-.patch b/queue-2.6.31/0045-ext4-EXT4_IOC_MOVE_EXT-Check-for-different-original-.patch new file mode 100644 index 00000000000..9c5665b2880 --- /dev/null +++ b/queue-2.6.31/0045-ext4-EXT4_IOC_MOVE_EXT-Check-for-different-original-.patch @@ -0,0 +1,49 @@ +From ab1fbc60d1f924d4ccbe645ce6e7e5df10d923e2 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 28 Sep 2009 15:58:29 -0400 +Subject: [PATCH 45/85] ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor inodes first + +(cherry picked from commit f3ce8064b388ccf420012c5a4907aae4f13fe9d0) + +Move the check to make sure the original and 
donor inodes are +different earlier, to avoid a potential deadlock by trying to lock the +same inode twice. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_ + return -EINVAL; + } + +- /* orig and donor should be different file */ +- if (orig_inode->i_ino == donor_inode->i_ino) { +- ext4_debug("ext4 move extent: The argument files should not " +- "be same file [ino:orig %lu, donor %lu]\n", +- orig_inode->i_ino, donor_inode->i_ino); +- return -EINVAL; +- } +- + /* Ext4 move extent supports only extent based file */ + if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { + ext4_debug("ext4 move extent: orig file is not extents " +@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, s + int block_len_in_page; + int uninit; + ++ /* orig and donor should be different file */ ++ if (orig_inode->i_ino == donor_inode->i_ino) { ++ ext4_debug("ext4 move extent: The argument files should not " ++ "be same file [ino:orig %lu, donor %lu]\n", ++ orig_inode->i_ino, donor_inode->i_ino); ++ return -EINVAL; ++ } ++ + /* protect orig and donor against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) diff --git a/queue-2.6.31/0046-ext4-Avoid-updating-the-inode-table-bh-twice-in-no-j.patch b/queue-2.6.31/0046-ext4-Avoid-updating-the-inode-table-bh-twice-in-no-j.patch new file mode 100644 index 00000000000..ca8535d0eb4 --- /dev/null +++ b/queue-2.6.31/0046-ext4-Avoid-updating-the-inode-table-bh-twice-in-no-j.patch @@ -0,0 +1,88 @@ +From 865943e8572497262dde59d86a8457b0e5b066d6 Mon Sep 17 00:00:00 2001 +From: Frank Mayhar +Date: Tue, 29 Sep 2009 10:07:47 -0400 +Subject: [PATCH 46/85] ext4: Avoid updating the inode table bh twice in no journal mode + +(cherry picked from commit 
830156c79b0a99ddf0f62496bcf4de640f9f52cd) + +This is a cleanup of commit 91ac6f4. Since ext4_mark_inode_dirty() +has already called ext4_mark_iloc_dirty(), which in turn calls +ext4_do_update_inode(), it's not necessary to have ext4_write_inode() +call ext4_do_update_inode() in no journal mode. Indeed, it would be +duplicated work. + +Reviewed-by: "Aneesh Kumar K.V" +Signed-off-by: Frank Mayhar +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 37 ++++++++++++++++--------------------- + 1 file changed, 16 insertions(+), 21 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4981,8 +4981,7 @@ static int ext4_inode_blocks_set(handle_ + */ + static int ext4_do_update_inode(handle_t *handle, + struct inode *inode, +- struct ext4_iloc *iloc, +- int do_sync) ++ struct ext4_iloc *iloc) + { + struct ext4_inode *raw_inode = ext4_raw_inode(iloc); + struct ext4_inode_info *ei = EXT4_I(inode); +@@ -5083,22 +5082,10 @@ static int ext4_do_update_inode(handle_t + raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); + } + +- /* +- * If we're not using a journal and we were called from +- * ext4_write_inode() to sync the inode (making do_sync true), +- * we can just use sync_dirty_buffer() directly to do our dirty +- * work. Testing s_journal here is a bit redundant but it's +- * worth it to avoid potential future trouble. 
+- */ +- if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) { +- BUFFER_TRACE(bh, "call sync_dirty_buffer"); +- sync_dirty_buffer(bh); +- } else { +- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); +- rc = ext4_handle_dirty_metadata(handle, inode, bh); +- if (!err) +- err = rc; +- } ++ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); ++ rc = ext4_handle_dirty_metadata(handle, inode, bh); ++ if (!err) ++ err = rc; + ei->i_state &= ~EXT4_STATE_NEW; + + out_brelse: +@@ -5166,8 +5153,16 @@ int ext4_write_inode(struct inode *inode + err = ext4_get_inode_loc(inode, &iloc); + if (err) + return err; +- err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, +- inode, &iloc, wait); ++ if (wait) ++ sync_dirty_buffer(iloc.bh); ++ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ++ ext4_error(inode->i_sb, __func__, ++ "IO error syncing inode, " ++ "inode=%lu, block=%llu", ++ inode->i_ino, ++ (unsigned long long)iloc.bh->b_blocknr); ++ err = -EIO; ++ } + } + return err; + } +@@ -5463,7 +5458,7 @@ int ext4_mark_iloc_dirty(handle_t *handl + get_bh(iloc->bh); + + /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ +- err = ext4_do_update_inode(handle, inode, iloc, 0); ++ err = ext4_do_update_inode(handle, inode, iloc); + put_bh(iloc->bh); + return err; + } diff --git a/queue-2.6.31/0047-ext4-Make-sure-ext4_dirty_inode-updates-the-inode-in.patch b/queue-2.6.31/0047-ext4-Make-sure-ext4_dirty_inode-updates-the-inode-in.patch new file mode 100644 index 00000000000..a89e92decf2 --- /dev/null +++ b/queue-2.6.31/0047-ext4-Make-sure-ext4_dirty_inode-updates-the-inode-in.patch @@ -0,0 +1,52 @@ +From 3e837135a49aba6f8d1467c45f07ef1a348fa1e6 Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Tue, 29 Sep 2009 16:06:01 -0400 +Subject: [PATCH 47/85] ext4: Make sure ext4_dirty_inode() updates the inode in no journal mode + +(cherry picked from commit f3dc272fd5e2ae08244796bb39e7e1ce4b25d3b3) + +This patch a problem that ext4_dirty_inode() was not calling 
+ext4_mark_inode_dirty() if the current_handle is not valid, which it +is the case in no journal mode. + +It also removes a test for non-matching transaction which can never +happen. + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 19 ++++--------------- + 1 file changed, 4 insertions(+), 15 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5605,24 +5605,13 @@ void ext4_dirty_inode(struct inode *inod + handle_t *current_handle = ext4_journal_current_handle(); + handle_t *handle; + +- if (!ext4_handle_valid(current_handle)) { +- ext4_mark_inode_dirty(current_handle, inode); +- return; +- } +- + handle = ext4_journal_start(inode, 2); + if (IS_ERR(handle)) + goto out; +- if (current_handle && +- current_handle->h_transaction != handle->h_transaction) { +- /* This task has a transaction open against a different fs */ +- printk(KERN_EMERG "%s: transactions do not match!\n", +- __func__); +- } else { +- jbd_debug(5, "marking dirty. outer handle=%p\n", +- current_handle); +- ext4_mark_inode_dirty(handle, inode); +- } ++ ++ jbd_debug(5, "marking dirty. 
outer handle=%p\n", current_handle); ++ ext4_mark_inode_dirty(handle, inode); ++ + ext4_journal_stop(handle); + out: + return; diff --git a/queue-2.6.31/0048-ext4-Handle-nested-ext4_journal_start-stop-calls-wit.patch b/queue-2.6.31/0048-ext4-Handle-nested-ext4_journal_start-stop-calls-wit.patch new file mode 100644 index 00000000000..3b6b5178b1b --- /dev/null +++ b/queue-2.6.31/0048-ext4-Handle-nested-ext4_journal_start-stop-calls-wit.patch @@ -0,0 +1,114 @@ +From 8f52450f144ba68f24eb23d37c940c7098889391 Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Tue, 29 Sep 2009 11:01:03 -0400 +Subject: [PATCH 48/85] ext4: Handle nested ext4_journal_start/stop calls without a journal + +(cherry picked from commit d3d1faf6a74496ea4435fd057c6a2cad49f3e523) + +This patch fixes a problem with handling nested calls to +ext4_journal_start/ext4_journal_stop, when there is no journal present. + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4_jbd2.h | 6 ++++-- + fs/ext4/namei.c | 3 ++- + fs/ext4/super.c | 42 ++++++++++++++++++++++++++++++++---------- + 3 files changed, 38 insertions(+), 13 deletions(-) + +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const c + handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); + int __ext4_journal_stop(const char *where, handle_t *handle); + +-#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) ++#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) + ++/* Note: Do not use this for NULL handles. This is only to determine if ++ * a properly allocated handle is using a journal or not. 
*/ + static inline int ext4_handle_valid(handle_t *handle) + { +- if (handle == EXT4_NOJOURNAL_HANDLE) ++ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) + return 0; + return 1; + } +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -2068,7 +2068,8 @@ int ext4_orphan_del(handle_t *handle, st + struct ext4_iloc iloc; + int err = 0; + +- if (!ext4_handle_valid(handle)) ++ /* ext4_handle_valid() assumes a valid handle_t pointer */ ++ if (handle && !ext4_handle_valid(handle)) + return 0; + + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -189,6 +189,36 @@ void ext4_itable_unused_set(struct super + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); + } + ++ ++/* Just increment the non-pointer handle value */ ++static handle_t *ext4_get_nojournal(void) ++{ ++ handle_t *handle = current->journal_info; ++ unsigned long ref_cnt = (unsigned long)handle; ++ ++ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT); ++ ++ ref_cnt++; ++ handle = (handle_t *)ref_cnt; ++ ++ current->journal_info = handle; ++ return handle; ++} ++ ++ ++/* Decrement the non-pointer handle value */ ++static void ext4_put_nojournal(handle_t *handle) ++{ ++ unsigned long ref_cnt = (unsigned long)handle; ++ ++ BUG_ON(ref_cnt == 0); ++ ++ ref_cnt--; ++ handle = (handle_t *)ref_cnt; ++ ++ current->journal_info = handle; ++} ++ + /* + * Wrappers for jbd2_journal_start/end. + * +@@ -215,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct s + } + return jbd2_journal_start(journal, nblocks); + } +- /* +- * We're not journaling, return the appropriate indication. +- */ +- current->journal_info = EXT4_NOJOURNAL_HANDLE; +- return current->journal_info; ++ return ext4_get_nojournal(); + } + + /* +@@ -235,11 +261,7 @@ int __ext4_journal_stop(const char *wher + int rc; + + if (!ext4_handle_valid(handle)) { +- /* +- * Do this here since we don't call jbd2_journal_stop() in +- * no-journal mode. 
+- */ +- current->journal_info = NULL; ++ ext4_put_nojournal(handle); + return 0; + } + sb = handle->h_transaction->t_journal->j_private; diff --git a/queue-2.6.31/0049-ext4-Fix-time-encoding-with-extra-epoch-bits.patch b/queue-2.6.31/0049-ext4-Fix-time-encoding-with-extra-epoch-bits.patch new file mode 100644 index 00000000000..2875c41bfd4 --- /dev/null +++ b/queue-2.6.31/0049-ext4-Fix-time-encoding-with-extra-epoch-bits.patch @@ -0,0 +1,41 @@ +From 49a528d30e81c05dea7208bb4c1fcb30dfbf898f Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Wed, 30 Sep 2009 01:13:55 -0400 +Subject: [PATCH 49/85] ext4: Fix time encoding with extra epoch bits + +(cherry picked from commit c1fccc0696bcaff6008c11865091f5ec4b0937ab) + +"Looking at ext4.h, I think the setting of extra time fields forgets to +mask the epoch bits so the epoch part overwrites nsec part. The second +change is only for coherency (2 -> EXT4_EPOCH_BITS)." + +Thanks to Damien Guibouret for pointing out this problem. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -522,8 +522,8 @@ struct move_extent { + static inline __le32 ext4_encode_extra_time(struct timespec *time) + { + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? 
+- time->tv_sec >> 32 : 0) | +- ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); ++ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) | ++ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK)); + } + + static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +@@ -531,7 +531,7 @@ static inline void ext4_decode_extra_tim + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; +- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; ++ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + } + + #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ diff --git a/queue-2.6.31/0050-ext4-fix-a-BUG_ON-crash-by-checking-that-page-has-bu.patch b/queue-2.6.31/0050-ext4-fix-a-BUG_ON-crash-by-checking-that-page-has-bu.patch new file mode 100644 index 00000000000..01ae93264c7 --- /dev/null +++ b/queue-2.6.31/0050-ext4-fix-a-BUG_ON-crash-by-checking-that-page-has-bu.patch @@ -0,0 +1,57 @@ +From 7fcfa625f4bdba955d0fb0b717dc576b1e48b470 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Wed, 30 Sep 2009 22:57:41 -0400 +Subject: [PATCH 50/85] ext4: fix a BUG_ON crash by checking that page has buffers attached to it + +(cherry picked from commit 1f94533d9cd75f6d2826018d54a971b9cc085992) + +In ext4_num_dirty_pages() we were calling page_buffers() before +checking to see if the page actually had pages attached to it; this +would cause a BUG check crash in the inline function page_buffers(). + +Thanks to Markus Trippelsdorf for reporting this bug. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 22 +++++++++++----------- + 1 file changed, 11 insertions(+), 11 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1147,8 +1147,8 @@ static int check_block_validity(struct i + } + + /* +- * Return the number of dirty pages in the given inode starting at +- * page frame idx. 
++ * Return the number of contiguous dirty pages in a given inode ++ * starting at page frame idx. + */ + static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +@@ -1182,15 +1182,15 @@ static pgoff_t ext4_num_dirty_pages(stru + unlock_page(page); + break; + } +- head = page_buffers(page); +- bh = head; +- do { +- if (!buffer_delay(bh) && +- !buffer_unwritten(bh)) { +- done = 1; +- break; +- } +- } while ((bh = bh->b_this_page) != head); ++ if (page_has_buffers(page)) { ++ bh = head = page_buffers(page); ++ do { ++ if (!buffer_delay(bh) && ++ !buffer_unwritten(bh)) ++ done = 1; ++ bh = bh->b_this_page; ++ } while (!done && (bh != head)); ++ } + unlock_page(page); + if (done) + break; diff --git a/queue-2.6.31/0051-ext4-retry-failed-direct-IO-allocations.patch b/queue-2.6.31/0051-ext4-retry-failed-direct-IO-allocations.patch new file mode 100644 index 00000000000..2c61d25d82b --- /dev/null +++ b/queue-2.6.31/0051-ext4-retry-failed-direct-IO-allocations.patch @@ -0,0 +1,48 @@ +From ff63e19060e0fc2fadab863a9c20be54f2d55b35 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Fri, 2 Oct 2009 21:20:55 -0400 +Subject: [PATCH 51/85] ext4: retry failed direct IO allocations + +(cherry picked from commit fbbf69456619de5d251cb9f1df609069178c62d5) + +On a 256M filesystem, doing this in a loop: + + xfs_io -F -f -d -c 'pwrite 0 64m' test + rm -f test + +eventually leads to ENOSPC. (the xfs_io command does a +64m direct IO write to the file "test") + +As with other block allocation callers, it looks like we need to +potentially retry the allocations on the initial ENOSPC. 
+ +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3372,6 +3372,7 @@ static ssize_t ext4_ind_direct_IO(int rw + ssize_t ret; + int orphan = 0; + size_t count = iov_length(iov, nr_segs); ++ int retries = 0; + + if (rw == WRITE) { + loff_t final_size = offset + count; +@@ -3394,9 +3395,12 @@ static ssize_t ext4_ind_direct_IO(int rw + } + } + ++retry: + ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, + offset, nr_segs, + ext4_get_block, NULL); ++ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) ++ goto retry; + + if (orphan) { + int err; diff --git a/queue-2.6.31/0052-ext4-discard-preallocation-when-restarting-a-transac.patch b/queue-2.6.31/0052-ext4-discard-preallocation-when-restarting-a-transac.patch new file mode 100644 index 00000000000..6d57fbdffe6 --- /dev/null +++ b/queue-2.6.31/0052-ext4-discard-preallocation-when-restarting-a-transac.patch @@ -0,0 +1,39 @@ +From d0f9bae5cc4b14857f209a7060a13aa266fa824f Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Mon, 2 Nov 2009 18:50:49 -0500 +Subject: [PATCH 52/85] ext4: discard preallocation when restarting a transaction during truncate + +(cherry picked from commit fa5d11133b07053270e18fa9c18560e66e79217e) + +When restart a transaction during a truncate operation, we drop and +reacquire i_data_sem. After reacquiring i_data_sem, we need to +discard any inode-based preallocation that might have been grabbed +while we released i_data_sem (for example, if pdflush is allocating +blocks and racing against the truncate). 
+ +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -193,7 +193,7 @@ static int try_to_extend_transaction(han + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, ++int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, + int nblocks) + { + int ret; +@@ -209,6 +209,7 @@ static int try_to_extend_transaction(han + up_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); + down_write(&EXT4_I(inode)->i_data_sem); ++ ext4_discard_preallocations(inode); + + return ret; + } diff --git a/queue-2.6.31/0053-ext4-fix-ext4_ext_direct_IO-s-return-value-after-con.patch b/queue-2.6.31/0053-ext4-fix-ext4_ext_direct_IO-s-return-value-after-con.patch new file mode 100644 index 00000000000..bdfd9b5a77d --- /dev/null +++ b/queue-2.6.31/0053-ext4-fix-ext4_ext_direct_IO-s-return-value-after-con.patch @@ -0,0 +1,59 @@ +From acd8eefddb85238ffd6ed9f67a159007d3de7254 Mon Sep 17 00:00:00 2001 +From: Mingming +Date: Tue, 10 Nov 2009 10:48:08 -0500 +Subject: [PATCH 53/85] ext4: fix ext4_ext_direct_IO()'s return value after converting uninit extents + +(cherry picked from commit 109f55651954def97fa41ee71c464d268c512ab0) + +After a direct I/O request covering an uninitialized extent (i.e., +created using the fallocate system call) or a hole in a file, ext4 +will convert the uninitialized extent so it is marked as initialized +by calling ext4_convert_unwritten_extents(). This function returns +zero on success. + +This return value was getting returned by ext4_direct_IO(); however +the file system's direct_IO function is supposed to return the number +of bytes read or written on success.
By returning zero, it confused +the direct I/O code into falling back to buffered I/O unnecessarily. + +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 1 + + fs/ext4/inode.c | 10 +++++++--- + 2 files changed, 8 insertions(+), 3 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3496,6 +3496,7 @@ retry: + * + * This function is called from the direct IO end io call back + * function, to convert the fallocated extents after IO is completed. ++ * Returns 0 on success. + */ + int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, + loff_t len) +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3766,13 +3766,17 @@ static ssize_t ext4_ext_direct_IO(int rw + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; +- } else if (ret > 0) ++ } else if (ret > 0) { ++ int err; + /* + * for non AIO case, since the IO is already + * completed, we could do the convertion right here + */ +- ret = ext4_convert_unwritten_extents(inode, +- offset, ret); ++ err = ext4_convert_unwritten_extents(inode, ++ offset, ret); ++ if (err < 0) ++ ret = err; ++ } + return ret; + } + diff --git a/queue-2.6.31/0054-ext4-skip-conversion-of-uninit-extents-after-direct-.patch b/queue-2.6.31/0054-ext4-skip-conversion-of-uninit-extents-after-direct-.patch new file mode 100644 index 00000000000..a893e0877eb --- /dev/null +++ b/queue-2.6.31/0054-ext4-skip-conversion-of-uninit-extents-after-direct-.patch @@ -0,0 +1,96 @@ +From 0c51dd21fdd04586411a3653caee9c73e3468ee5 Mon Sep 17 00:00:00 2001 +From: Mingming +Date: Tue, 10 Nov 2009 10:48:04 -0500 +Subject: [PATCH 54/85] ext4: skip conversion of uninit extents after direct IO if there isn't any + +(cherry picked from commit 5f5249507e4b5c4fc0f9c93f33d133d8c95f47e1) + +At the end of direct I/O operation, ext4_ext_direct_IO() always called +ext4_convert_unwritten_extents(), regardless of whether 
there were any +unwritten extents involved in the I/O or not. + +This commit adds a state flag so that ext4_ext_direct_IO() only calls +ext4_convert_unwritten_extents() when necessary. + +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 1 + + fs/ext4/extents.c | 22 +++++++++++++++++----- + fs/ext4/inode.c | 4 +++- + 3 files changed, 21 insertions(+), 6 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -318,6 +318,7 @@ static inline __u32 ext4_mask_flags(umod + #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ + #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ + #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ ++#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ + + /* Used to pass group descriptor data when online resize is done */ + struct ext4_new_group_input { +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3025,12 +3025,18 @@ ext4_ext_handle_uninitialized_extents(ha + ret = ext4_split_unwritten_extents(handle, + inode, path, iblock, + max_blocks, flags); +- /* flag the io_end struct that we need convert when IO done */ ++ /* ++ * Flag the inode(non aio case) or end_io struct (aio case) ++ * that this IO needs to convertion to written when IO is ++ * completed ++ */ + if (io) + io->flag = DIO_AIO_UNWRITTEN; ++ else ++ EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; + goto out; + } +- /* DIO end_io complete, convert the filled extent to written */ ++ /* async DIO end_io complete, convert the filled extent to written */ + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { + ret = ext4_convert_unwritten_extents_dio(handle, inode, + path); +@@ -3272,10 +3278,16 @@ int ext4_ext_get_blocks(handle_t *handle + * To avoid unecessary convertion for every aio dio rewrite + * to the mid of file, here we flag the IO that is really + * need the convertion. 
+- * ++ * For non asycn direct IO case, flag the inode state ++ * that we need to perform convertion when IO is done. + */ +- if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) +- io->flag = DIO_AIO_UNWRITTEN; ++ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { ++ if (io) ++ io->flag = DIO_AIO_UNWRITTEN; ++ else ++ EXT4_I(inode)->i_state |= ++ EXT4_STATE_DIO_UNWRITTEN;; ++ } + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err) { +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3766,7 +3766,8 @@ static ssize_t ext4_ext_direct_IO(int rw + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; +- } else if (ret > 0) { ++ } else if (ret > 0 && (EXT4_I(inode)->i_state & ++ EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already +@@ -3776,6 +3777,7 @@ static ssize_t ext4_ext_direct_IO(int rw + offset, ret); + if (err < 0) + ret = err; ++ EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; + } + return ret; + } diff --git a/queue-2.6.31/0055-ext4-code-clean-up-for-dio-fallocate-handling.patch b/queue-2.6.31/0055-ext4-code-clean-up-for-dio-fallocate-handling.patch new file mode 100644 index 00000000000..db3971830b9 --- /dev/null +++ b/queue-2.6.31/0055-ext4-code-clean-up-for-dio-fallocate-handling.patch @@ -0,0 +1,52 @@ +From 92f6ebef4be7dffcb13f0f70e4c3b3200feb0c6b Mon Sep 17 00:00:00 2001 +From: Mingming +Date: Tue, 3 Nov 2009 14:44:54 -0500 +Subject: [PATCH 55/85] ext4: code clean up for dio fallocate handling + +(cherry picked from commit 4b70df181611012a3556f017b57dfcef7e1d279f) + +The ext4_debug() call in ext4_end_io_dio() should be moved after the +check to make sure that io_end is non-NULL. + +The comment above ext4_get_block_dio_write() ("Maximum number of +blocks...") is a duplicate; the original and correct comment is above +the #define DIO_MAX_BLOCKS up above. + +Based on review comments from Curt Wohlgemuth. 
+ +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3440,8 +3440,6 @@ out: + return ret; + } + +-/* Maximum number of blocks we map for direct IO at once. */ +- + static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) + { +@@ -3649,13 +3647,14 @@ static void ext4_end_io_dio(struct kiocb + ext4_io_end_t *io_end = iocb->private; + struct workqueue_struct *wq; + ++ /* if not async direct IO or dio with 0 bytes write, just return */ ++ if (!io_end || !size) ++ return; ++ + ext_debug("ext4_end_io_dio(): io_end 0x%p" + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", + iocb->private, io_end->inode->i_ino, iocb, offset, + size); +- /* if not async direct IO or dio with 0 bytes write, just return */ +- if (!io_end || !size) +- return; + + /* if not aio dio with unwritten extents, just free io and return */ + if (io_end->flag != DIO_AIO_UNWRITTEN){ diff --git a/queue-2.6.31/0056-ext4-Fix-return-value-of-ext4_split_unwritten_extent.patch b/queue-2.6.31/0056-ext4-Fix-return-value-of-ext4_split_unwritten_extent.patch new file mode 100644 index 00000000000..5a647492573 --- /dev/null +++ b/queue-2.6.31/0056-ext4-Fix-return-value-of-ext4_split_unwritten_extent.patch @@ -0,0 +1,62 @@ +From 9ae57b4a344fc808a2b7b763ab86a4a4a0a82410 Mon Sep 17 00:00:00 2001 +From: Mingming +Date: Fri, 6 Nov 2009 04:01:23 -0500 +Subject: [PATCH 56/85] ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O + +(cherry picked from commit ba230c3f6dc88ec008806adb27b12088486d508e) + +To prepare for a direct I/O write, we need to split the unwritten +extents before submitting the I/O. When no extents needed to be +split, ext4_split_unwritten_extents() was incorrectly returning 0 +instead of the size of uninitialized extents. 
This bug caused the +wrong return value sent back to VFS code when it gets called from +async IO path, leading to an unnecessary fall back to buffered IO. + +This bug also hid the fact that the check to see whether or not a +split would be necessary was incorrect; we can only skip splitting the +extent if the write completely covers the uninitialized extent. + +Signed-off-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 13 +++++++------ + 1 file changed, 7 insertions(+), 6 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2784,6 +2784,8 @@ fix_extent_len: + * into three uninitialized extent(at most). After IO complete, the part + * being filled will be convert to initialized by the end_io callback function + * via ext4_convert_unwritten_extents(). ++ * ++ * Returns the size of uninitialized extent to be written on success. + */ + static int ext4_split_unwritten_extents(handle_t *handle, + struct inode *inode, +@@ -2801,7 +2803,6 @@ static int ext4_split_unwritten_extents( + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; +- int ret = 0; + + ext_debug("ext4_split_unwritten_extents: inode %lu," + "iblock %llu, max_blocks %u\n", inode->i_ino, +@@ -2819,12 +2820,12 @@ static int ext4_split_unwritten_extents( + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + /* +- * if the entire unintialized extent length less than +- * the size of extent to write, there is no need to split +- * uninitialized extent ++ * If the uninitialized extent begins at the same logical ++ * block where the write begins, and the write completely ++ * covers the extent, then we don't need to split it. 
+ */ +- if (allocated <= max_blocks) +- return ret; ++ if ((iblock == ee_block) && (allocated <= max_blocks)) ++ return allocated; + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) diff --git a/queue-2.6.31/0057-ext4-fix-potential-buffer-head-leak-when-add_dirent_.patch b/queue-2.6.31/0057-ext4-fix-potential-buffer-head-leak-when-add_dirent_.patch new file mode 100644 index 00000000000..16a9e0f7537 --- /dev/null +++ b/queue-2.6.31/0057-ext4-fix-potential-buffer-head-leak-when-add_dirent_.patch @@ -0,0 +1,122 @@ +From 217fe01d91370e4748409f76f5aae0081085b140 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 23 Nov 2009 07:25:49 -0500 +Subject: [PATCH 57/85] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC + +(cherry picked from commit 2de770a406b06dfc619faabbf5d85c835ed3f2e1) + +Previously add_dirent_to_buf() did not free its passed-in buffer head +in the case of ENOSPC, since in some cases the caller still needed it. +However, this led to potential buffer head leaks since not all callers +dealt with this correctly. Fix this by simplifying the freeing +convention; now add_dirent_to_buf() *never* frees the passed-in buffer +head, and leaves that to the responsibility of its caller. This makes +things cleaner and easier to prove that the code is neither leaking +buffer heads nor calling brelse() one time too many. + +Signed-off-by: "Theodore Ts'o" +Cc: Curt Wohlgemuth +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/namei.c | 30 ++++++++++++------------------ + 1 file changed, 12 insertions(+), 18 deletions(-) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1292,9 +1292,6 @@ errout: + * add_dirent_to_buf will attempt search the directory block for + * space. It will return -ENOSPC if no space is available, and -EIO + * and -EEXIST if directory entry already exists. +- * +- * NOTE! bh is NOT released in the case where ENOSPC is returned. In +- * all other cases bh is released.
+ */ + static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, + struct inode *inode, struct ext4_dir_entry_2 *de, +@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *h + top = bh->b_data + blocksize - reclen; + while ((char *) de <= top) { + if (!ext4_check_dir_entry("ext4_add_entry", dir, de, +- bh, offset)) { +- brelse(bh); ++ bh, offset)) + return -EIO; +- } +- if (ext4_match(namelen, name, de)) { +- brelse(bh); ++ if (ext4_match(namelen, name, de)) + return -EEXIST; +- } + nlen = EXT4_DIR_REC_LEN(de->name_len); + rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); + if ((de->inode? rlen - nlen: rlen) >= reclen) +@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *h + err = ext4_journal_get_write_access(handle, bh); + if (err) { + ext4_std_error(dir->i_sb, err); +- brelse(bh); + return err; + } + +@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *h + err = ext4_handle_dirty_metadata(handle, dir, bh); + if (err) + ext4_std_error(dir->i_sb, err); +- brelse(bh); + return 0; + } + +@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *ha + if (!(de)) + return retval; + +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ brelse(bh); ++ return retval; + } + + /* +@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *hand + if(!bh) + return retval; + retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (retval != -ENOSPC) ++ if (retval != -ENOSPC) { ++ brelse(bh); + return retval; ++ } + + if (blocks == 1 && !dx_fallback && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) +@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *hand + de = (struct ext4_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); +- return add_dirent_to_buf(handle, dentry, inode, de, bh); ++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh); ++ brelse(bh); ++ return retval; + 
} + + /* +@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *h + goto journal_error; + + err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); +- if (err != -ENOSPC) { +- bh = NULL; ++ if (err != -ENOSPC) + goto cleanup; +- } + + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", +@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *h + if (!de) + goto cleanup; + err = add_dirent_to_buf(handle, dentry, inode, de, bh); +- bh = NULL; + goto cleanup; + + journal_error: diff --git a/queue-2.6.31/0058-ext4-avoid-divide-by-zero-when-trying-to-mount-a-cor.patch b/queue-2.6.31/0058-ext4-avoid-divide-by-zero-when-trying-to-mount-a-cor.patch new file mode 100644 index 00000000000..76ebfaeb2a5 --- /dev/null +++ b/queue-2.6.31/0058-ext4-avoid-divide-by-zero-when-trying-to-mount-a-cor.patch @@ -0,0 +1,43 @@ +From de0ca30bddfa2e6ef13a69c35638aceec209bee1 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 23 Nov 2009 07:24:46 -0500 +Subject: [PATCH 58/85] ext4: avoid divide by zero when trying to mount a corrupted file system + +(cherry picked from commit 503358ae01b70ce6909d19dd01287093f6b6271c) + +If s_log_groups_per_flex is greater than 31, then groups_per_flex will +overflow and cause a divide by zero error. This can cause a kernel +BUG if such a file system is mounted. + +Thanks to Nageswara R Sastry for analyzing the failure and providing +an initial patch.
+ +http://bugzilla.kernel.org/show_bug.cgi?id=14287 + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1695,14 +1695,14 @@ static int ext4_fill_flex_info(struct su + size_t size; + int i; + +- if (!sbi->s_es->s_log_groups_per_flex) { ++ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; ++ groups_per_flex = 1 << sbi->s_log_groups_per_flex; ++ ++ if (groups_per_flex < 2) { + sbi->s_log_groups_per_flex = 0; + return 1; + } + +- sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; +- groups_per_flex = 1 << sbi->s_log_groups_per_flex; +- + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << diff --git a/queue-2.6.31/0059-ext4-fix-the-returned-block-count-if-EXT4_IOC_MOVE_E.patch b/queue-2.6.31/0059-ext4-fix-the-returned-block-count-if-EXT4_IOC_MOVE_E.patch new file mode 100644 index 00000000000..3248b039c32 --- /dev/null +++ b/queue-2.6.31/0059-ext4-fix-the-returned-block-count-if-EXT4_IOC_MOVE_E.patch @@ -0,0 +1,353 @@ +From 9c3f2f73d340b25d6e3d15688e3051ad1a159f78 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Mon, 23 Nov 2009 07:25:48 -0500 +Subject: [PATCH 59/85] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails + +(cherry picked from commit f868a48d06f8886cb0367568a12367fa4f21ea0d) + +If the EXT4_IOC_MOVE_EXT ioctl fails, the number of blocks that were +exchanged before the failure should be returned to the userspace +caller. Unfortunately, currently if the block size is not the same as +the page size, the returned block count that is returned is the +page-aligned block count instead of the actual block count. This +commit addresses this bug. 
+ +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 139 ++++++++++++++++++++++++++------------------------ + 1 file changed, 73 insertions(+), 66 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -661,6 +661,7 @@ mext_calc_swap_extents(struct ext4_exten + * @donor_inode: donor inode + * @from: block offset of orig_inode + * @count: block count to be replaced ++ * @err: pointer to save return value + * + * Replace original inode extents and donor inode extents page by page. + * We implement this replacement in the following three steps: +@@ -671,19 +672,18 @@ mext_calc_swap_extents(struct ext4_exten + * 3. Change the block information of donor inode to point at the saved + * original inode blocks in the dummy extents. + * +- * Return 0 on success, or a negative error value on failure. ++ * Return replaced block count. + */ + static int + mext_replace_branches(handle_t *handle, struct inode *orig_inode, + struct inode *donor_inode, ext4_lblk_t from, +- ext4_lblk_t count) ++ ext4_lblk_t count, int *err) + { + struct ext4_ext_path *orig_path = NULL; + struct ext4_ext_path *donor_path = NULL; + struct ext4_extent *oext, *dext; + struct ext4_extent tmp_dext, tmp_oext; + ext4_lblk_t orig_off = from, donor_off = from; +- int err = 0; + int depth; + int replaced_count = 0; + int dext_alen; +@@ -691,13 +691,13 @@ mext_replace_branches(handle_t *handle, + mext_double_down_write(orig_inode, donor_inode); + + /* Get the original extent for the block "orig_off" */ +- err = get_ext_path(orig_inode, orig_off, &orig_path); +- if (err) ++ *err = get_ext_path(orig_inode, orig_off, &orig_path); ++ if (*err) + goto out; + + /* Get the donor extent for the head */ +- err = get_ext_path(donor_inode, donor_off, &donor_path); +- if (err) ++ *err = get_ext_path(donor_inode, donor_off, &donor_path); ++ if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; 
+@@ -707,9 +707,9 @@ mext_replace_branches(handle_t *handle, + dext = donor_path[depth].p_ext; + tmp_dext = *dext; + +- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, ++ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count); +- if (err) ++ if (*err) + goto out; + + /* Loop for the donor extents */ +@@ -718,7 +718,7 @@ mext_replace_branches(handle_t *handle, + if (!dext) { + ext4_error(donor_inode->i_sb, __func__, + "The extent for donor must be found"); +- err = -EIO; ++ *err = -EIO; + goto out; + } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { + ext4_error(donor_inode->i_sb, __func__, +@@ -726,20 +726,20 @@ mext_replace_branches(handle_t *handle, + "extent(%u) should be equal", + donor_off, + le32_to_cpu(tmp_dext.ee_block)); +- err = -EIO; ++ *err = -EIO; + goto out; + } + + /* Set donor extent to orig extent */ +- err = mext_leaf_block(handle, orig_inode, ++ *err = mext_leaf_block(handle, orig_inode, + orig_path, &tmp_dext, &orig_off); +- if (err < 0) ++ if (*err) + goto out; + + /* Set orig extent to donor extent */ +- err = mext_leaf_block(handle, donor_inode, ++ *err = mext_leaf_block(handle, donor_inode, + donor_path, &tmp_oext, &donor_off); +- if (err < 0) ++ if (*err) + goto out; + + dext_alen = ext4_ext_get_actual_len(&tmp_dext); +@@ -753,35 +753,25 @@ mext_replace_branches(handle_t *handle, + + if (orig_path) + ext4_ext_drop_refs(orig_path); +- err = get_ext_path(orig_inode, orig_off, &orig_path); +- if (err) ++ *err = get_ext_path(orig_inode, orig_off, &orig_path); ++ if (*err) + goto out; + depth = ext_depth(orig_inode); + oext = orig_path[depth].p_ext; +- if (le32_to_cpu(oext->ee_block) + +- ext4_ext_get_actual_len(oext) <= orig_off) { +- err = 0; +- goto out; +- } + tmp_oext = *oext; + + if (donor_path) + ext4_ext_drop_refs(donor_path); +- err = get_ext_path(donor_inode, donor_off, &donor_path); +- if (err) ++ *err = get_ext_path(donor_inode, donor_off, &donor_path); ++ if (*err) + goto out; + depth 
= ext_depth(donor_inode); + dext = donor_path[depth].p_ext; +- if (le32_to_cpu(dext->ee_block) + +- ext4_ext_get_actual_len(dext) <= donor_off) { +- err = 0; +- goto out; +- } + tmp_dext = *dext; + +- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, ++ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, + donor_off, count - replaced_count); +- if (err) ++ if (*err) + goto out; + } + +@@ -796,7 +786,7 @@ out: + } + + mext_double_up_write(orig_inode, donor_inode); +- return err; ++ return replaced_count; + } + + /** +@@ -808,16 +798,17 @@ out: + * @data_offset_in_page: block index where data swapping starts + * @block_len_in_page: the number of blocks to be swapped + * @uninit: orig extent is uninitialized or not ++ * @err: pointer to save return value + * + * Save the data in original inode blocks and replace original inode extents + * with donor inode extents by calling mext_replace_branches(). +- * Finally, write out the saved data in new original inode blocks. Return 0 +- * on success, or a negative error value on failure. ++ * Finally, write out the saved data in new original inode blocks. Return ++ * replaced block count. 
+ */ + static int + move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + pgoff_t orig_page_offset, int data_offset_in_page, +- int block_len_in_page, int uninit) ++ int block_len_in_page, int uninit, int *err) + { + struct inode *orig_inode = o_filp->f_dentry->d_inode; + struct address_space *mapping = orig_inode->i_mapping; +@@ -829,9 +820,11 @@ move_extent_per_page(struct file *o_filp + long long offs = orig_page_offset << PAGE_CACHE_SHIFT; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; + unsigned int w_flags = 0; +- unsigned int tmp_data_len, data_len; ++ unsigned int tmp_data_size, data_size, replaced_size; + void *fsdata; +- int ret, i, jblocks; ++ int i, jblocks; ++ int err2 = 0; ++ int replaced_count = 0; + int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + + /* +@@ -841,8 +834,8 @@ move_extent_per_page(struct file *o_filp + jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; + handle = ext4_journal_start(orig_inode, jblocks); + if (IS_ERR(handle)) { +- ret = PTR_ERR(handle); +- return ret; ++ *err = PTR_ERR(handle); ++ return 0; + } + + if (segment_eq(get_fs(), KERNEL_DS)) +@@ -858,9 +851,9 @@ move_extent_per_page(struct file *o_filp + * Just swap data blocks between orig and donor. 
+ */ + if (uninit) { +- ret = mext_replace_branches(handle, orig_inode, +- donor_inode, orig_blk_offset, +- block_len_in_page); ++ replaced_count = mext_replace_branches(handle, orig_inode, ++ donor_inode, orig_blk_offset, ++ block_len_in_page, err); + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); +@@ -870,27 +863,28 @@ move_extent_per_page(struct file *o_filp + + offs = (long long)orig_blk_offset << orig_inode->i_blkbits; + +- /* Calculate data_len */ ++ /* Calculate data_size */ + if ((orig_blk_offset + block_len_in_page - 1) == + ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { + /* Replace the last block */ +- tmp_data_len = orig_inode->i_size & (blocksize - 1); ++ tmp_data_size = orig_inode->i_size & (blocksize - 1); + /* +- * If data_len equal zero, it shows data_len is multiples of ++ * If data_size equal zero, it shows data_size is multiples of + * blocksize. So we set appropriate value. + */ +- if (tmp_data_len == 0) +- tmp_data_len = blocksize; ++ if (tmp_data_size == 0) ++ tmp_data_size = blocksize; + +- data_len = tmp_data_len + ++ data_size = tmp_data_size + + ((block_len_in_page - 1) << orig_inode->i_blkbits); +- } else { +- data_len = block_len_in_page << orig_inode->i_blkbits; +- } ++ } else ++ data_size = block_len_in_page << orig_inode->i_blkbits; ++ ++ replaced_size = data_size; + +- ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, ++ *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags, + &page, &fsdata); +- if (unlikely(ret < 0)) ++ if (unlikely(*err < 0)) + goto out; + + if (!PageUptodate(page)) { +@@ -911,10 +905,17 @@ move_extent_per_page(struct file *o_filp + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + +- ret = mext_replace_branches(handle, orig_inode, donor_inode, +- orig_blk_offset, block_len_in_page); +- if (ret < 0) +- goto out; ++ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, ++ 
orig_blk_offset, block_len_in_page, ++ &err2); ++ if (err2) { ++ if (replaced_count) { ++ block_len_in_page = replaced_count; ++ replaced_size = ++ block_len_in_page << orig_inode->i_blkbits; ++ } else ++ goto out; ++ } + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); +@@ -928,16 +929,16 @@ move_extent_per_page(struct file *o_filp + bh = bh->b_this_page; + + for (i = 0; i < block_len_in_page; i++) { +- ret = ext4_get_block(orig_inode, ++ *err = ext4_get_block(orig_inode, + (sector_t)(orig_blk_offset + i), bh, 0); +- if (ret < 0) ++ if (*err < 0) + goto out; + + if (bh->b_this_page != NULL) + bh = bh->b_this_page; + } + +- ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, ++ *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size, + page, fsdata); + page = NULL; + +@@ -951,7 +952,10 @@ out: + out2: + ext4_journal_stop(handle); + +- return ret < 0 ? ret : 0; ++ if (err2) ++ *err = err2; ++ ++ return replaced_count; + } + + /** +@@ -1367,15 +1371,17 @@ ext4_move_extents(struct file *o_filp, s + while (orig_page_offset <= seq_end_page) { + + /* Swap original branches with new branches */ +- ret1 = move_extent_per_page(o_filp, donor_inode, ++ block_len_in_page = move_extent_per_page( ++ o_filp, donor_inode, + orig_page_offset, + data_offset_in_page, +- block_len_in_page, uninit); +- if (ret1 < 0) +- goto out; +- orig_page_offset++; ++ block_len_in_page, uninit, ++ &ret1); ++ + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; ++ if (ret1 < 0) ++ goto out; + if (*moved_len > len) { + ext4_error(orig_inode->i_sb, __func__, + "We replaced blocks too much! 
" +@@ -1385,6 +1391,7 @@ ext4_move_extents(struct file *o_filp, s + goto out; + } + ++ orig_page_offset++; + data_offset_in_page = 0; + rest_blocks -= block_len_in_page; + if (rest_blocks > blocks_per_page) diff --git a/queue-2.6.31/0060-ext4-fix-lock-order-problem-in-ext4_move_extents.patch b/queue-2.6.31/0060-ext4-fix-lock-order-problem-in-ext4_move_extents.patch new file mode 100644 index 00000000000..df888ee30b6 --- /dev/null +++ b/queue-2.6.31/0060-ext4-fix-lock-order-problem-in-ext4_move_extents.patch @@ -0,0 +1,314 @@ +From ac8a1aac1d890562a6b843ecdfc146a9eb00b359 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Mon, 23 Nov 2009 07:24:43 -0500 +Subject: [PATCH 60/85] ext4: fix lock order problem in ext4_move_extents() + +(cherry picked from commit fc04cb49a898c372a22b21fffc47f299d8710801) + +ext4_move_extents() checks the logical block contiguousness +of original file with ext4_find_extent() and mext_next_extent(). +Therefore the extent which ext4_ext_path structure indicates +must not be changed between above functions. + +But in current implementation, there is no i_data_sem protection +between ext4_ext_find_extent() and mext_next_extent(). So the extent +which ext4_ext_path structure indicates may be overwritten by +delalloc. As a result, ext4_move_extents() will exchange wrong blocks +between original and donor files. I change the place where +acquire/release i_data_sem to solve this problem. + +Moreover, I changed move_extent_per_page() to start transaction first, +and then acquire i_data_sem. Without this change, there is a +possibility of the deadlock between mmap() and ext4_move_extents(): + +* NOTE: "A", "B" and "C" mean different processes + +A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes. + +B: do_page_fault() starts the transaction (T), + and then tries to acquire i_data_sem. + But process "A" is already holding it, so it is kept waiting. 
+ +C: While "A" and "B" running, kjournald2 tries to commit transaction (T) + but it is under updating, so kjournald2 waits for it. + +A-2: Call ext4_journal_start with holding i_data_sem, + but transaction (T) is locked. + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 117 ++++++++++++++++++++++---------------------------- + 1 file changed, 53 insertions(+), 64 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -77,12 +77,14 @@ static int + mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent) + { ++ struct ext4_extent_header *eh; + int ppos, leaf_ppos = path->p_depth; + + ppos = leaf_ppos; + if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { + /* leaf block */ + *extent = ++path[ppos].p_ext; ++ path[ppos].p_block = ext_pblock(path[ppos].p_ext); + return 0; + } + +@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, st + ext_block_hdr(path[cur_ppos+1].p_bh); + } + ++ path[leaf_ppos].p_ext = *extent = NULL; ++ ++ eh = path[leaf_ppos].p_hdr; ++ if (le16_to_cpu(eh->eh_entries) == 0) ++ /* empty leaf is found */ ++ return -ENODATA; ++ + /* leaf block */ + path[leaf_ppos].p_ext = *extent = + EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); ++ path[leaf_ppos].p_block = ++ ext_pblock(path[leaf_ppos].p_ext); + return 0; + } + } +@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inod + } + + /** +- * mext_double_down_read - Acquire two inodes' read semaphore +- * +- * @orig_inode: original inode structure +- * @donor_inode: donor inode structure +- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order. 
+- */ +-static void +-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) +-{ +- struct inode *first = orig_inode, *second = donor_inode; +- +- /* +- * Use the inode number to provide the stable locking order instead +- * of its address, because the C language doesn't guarantee you can +- * compare pointers that don't come from the same array. +- */ +- if (donor_inode->i_ino < orig_inode->i_ino) { +- first = donor_inode; +- second = orig_inode; +- } +- +- down_read(&EXT4_I(first)->i_data_sem); +- down_read(&EXT4_I(second)->i_data_sem); +-} +- +-/** +- * mext_double_down_write - Acquire two inodes' write semaphore ++ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure + * @donor_inode: donor inode structure +- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. ++ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by ++ * i_ino order. + */ + static void +-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) ++double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) + { + struct inode *first = orig_inode, *second = donor_inode; + +@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *ori + } + + /** +- * mext_double_up_read - Release two inodes' read semaphore ++ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem + * + * @orig_inode: original inode structure to be released its lock first + * @donor_inode: donor inode structure to be released its lock second +- * Release read semaphore of two inodes (orig and donor). ++ * Release write lock of i_data_sem of two inodes (orig and donor). 
+ */ + static void +-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) +-{ +- up_read(&EXT4_I(orig_inode)->i_data_sem); +- up_read(&EXT4_I(donor_inode)->i_data_sem); +-} +- +-/** +- * mext_double_up_write - Release two inodes' write semaphore +- * +- * @orig_inode: original inode structure to be released its lock first +- * @donor_inode: donor inode structure to be released its lock second +- * Release write semaphore of two inodes (orig and donor). +- */ +-static void +-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) ++double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) + { + up_write(&EXT4_I(orig_inode)->i_data_sem); + up_write(&EXT4_I(donor_inode)->i_data_sem); +@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle, + int replaced_count = 0; + int dext_alen; + +- mext_double_down_write(orig_inode, donor_inode); +- + /* Get the original extent for the block "orig_off" */ + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) +@@ -785,7 +755,6 @@ out: + kfree(donor_path); + } + +- mext_double_up_write(orig_inode, donor_inode); + return replaced_count; + } + +@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp + * Just swap data blocks between orig and donor. 
+ */ + if (uninit) { ++ /* ++ * Protect extent trees against block allocations ++ * via delalloc ++ */ ++ double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); +@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); ++ double_up_write_data_sem(orig_inode, donor_inode); + goto out2; + } + +@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + ++ /* Protect extent trees against block allocations via delalloc */ ++ double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); +@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; +- } else ++ } else { ++ double_up_write_data_sem(orig_inode, donor_inode); + goto out; ++ } + } + + /* Clear the inode cache not to refer to the old data */ + ext4_ext_invalidate_cache(orig_inode); + ext4_ext_invalidate_cache(donor_inode); + ++ double_up_write_data_sem(orig_inode, donor_inode); ++ + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + +@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, s + return -EINVAL; + } + +- /* protect orig and donor against a truncate */ ++ /* Protect orig and donor inodes against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); + if (ret1 < 0) + return ret1; + +- mext_double_down_read(orig_inode, donor_inode); ++ /* Protect extent tree against block allocations via delalloc */ ++ double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment 
whether move_extent can be done */ + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, + donor_start, &len, *moved_len); +- mext_double_up_read(orig_inode, donor_inode); + if (ret1) + goto out; + +@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, s + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + ++ /* Discard preallocations of two inodes */ ++ ext4_discard_preallocations(orig_inode); ++ ext4_discard_preallocations(donor_inode); ++ + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + +@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, s + seq_start = le32_to_cpu(ext_cur->ee_block); + rest_blocks = seq_blocks; + +- /* Discard preallocations of two inodes */ +- down_write(&EXT4_I(orig_inode)->i_data_sem); +- ext4_discard_preallocations(orig_inode); +- up_write(&EXT4_I(orig_inode)->i_data_sem); +- +- down_write(&EXT4_I(donor_inode)->i_data_sem); +- ext4_discard_preallocations(donor_inode); +- up_write(&EXT4_I(donor_inode)->i_data_sem); ++ /* ++ * Up semaphore to avoid following problems: ++ * a. transaction deadlock among ext4_journal_start, ++ * ->write_begin via pagefault, and jbd2_journal_commit ++ * b. racing with ->readpage, ->write_begin, and ext4_get_block ++ * in move_extent_per_page ++ */ ++ double_up_write_data_sem(orig_inode, donor_inode); + + while (orig_page_offset <= seq_end_page) { + +@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, s + /* Count how many blocks we have exchanged */ + *moved_len += block_len_in_page; + if (ret1 < 0) +- goto out; ++ break; + if (*moved_len > len) { + ext4_error(orig_inode->i_sb, __func__, + "We replaced blocks too much! 
" + "sum of replaced: %llu requested: %llu", + *moved_len, len); + ret1 = -EIO; +- goto out; ++ break; + } + + orig_page_offset++; +@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, s + block_len_in_page = rest_blocks; + } + ++ double_down_write_data_sem(orig_inode, donor_inode); ++ if (ret1 < 0) ++ break; ++ + /* Decrease buffer counter */ + if (holecheck_path) + ext4_ext_drop_refs(holecheck_path); +@@ -1429,7 +1418,7 @@ out: + ext4_ext_drop_refs(holecheck_path); + kfree(holecheck_path); + } +- ++ double_up_write_data_sem(orig_inode, donor_inode); + ret2 = mext_inode_double_unlock(orig_inode, donor_inode); + + if (ret1) diff --git a/queue-2.6.31/0061-ext4-fix-possible-recursive-locking-warning-in-EXT4_.patch b/queue-2.6.31/0061-ext4-fix-possible-recursive-locking-warning-in-EXT4_.patch new file mode 100644 index 00000000000..5528890a0a4 --- /dev/null +++ b/queue-2.6.31/0061-ext4-fix-possible-recursive-locking-warning-in-EXT4_.patch @@ -0,0 +1,36 @@ +From 25e65c388a1b7d4163e12289c070b5da0cc933c9 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Mon, 23 Nov 2009 07:24:41 -0500 +Subject: [PATCH 61/85] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT + +(cherry picked from commit 49bd22bc4d603a2a4fc2a6a60e156cbea52eb494) + +If CONFIG_PROVE_LOCKING is enabled, the double_down_write_data_sem() +will trigger a false-positive warning of a recursive lock. Since we +take i_data_sem for the two inodes ordered by their inode numbers, +this isn't a problem. Use of down_write_nested() will notify the lock +dependency checker machinery that there is no problem here. 
+ +This problem was reported by Brian Rogers: + + http://marc.info/?l=linux-ext4&m=125115356928011&w=1 + +Reported-by: Brian Rogers +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -189,7 +189,7 @@ double_down_write_data_sem(struct inode + } + + down_write(&EXT4_I(first)->i_data_sem); +- down_write(&EXT4_I(second)->i_data_sem); ++ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING); + } + + /** diff --git a/queue-2.6.31/0062-ext4-plug-a-buffer_head-leak-in-an-error-path-of-ext.patch b/queue-2.6.31/0062-ext4-plug-a-buffer_head-leak-in-an-error-path-of-ext.patch new file mode 100644 index 00000000000..ec20ef9f176 --- /dev/null +++ b/queue-2.6.31/0062-ext4-plug-a-buffer_head-leak-in-an-error-path-of-ext.patch @@ -0,0 +1,86 @@ +From a3b3756ab5968074429c05ce891a297a25f79d8e Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 14 Nov 2009 08:19:05 -0500 +Subject: [PATCH 62/85] ext4: plug a buffer_head leak in an error path of ext4_iget() + +(cherry picked from commit 567f3e9a70d71e5c9be03701b8578be77857293b) + +One of the invalid error paths in ext4_iget() forgot to brelse() the +inode buffer head. Fix it by adding a brelse() in the common error +return path, which also simplifies function. + +Thanks to Andi Kleen reporting the problem. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 11 +++-------- + 1 file changed, 3 insertions(+), 8 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4771,7 +4771,6 @@ struct inode *ext4_iget(struct super_blo + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; +- struct buffer_head *bh; + struct inode *inode; + long ret; + int block; +@@ -4783,11 +4782,11 @@ struct inode *ext4_iget(struct super_blo + return inode; + + ei = EXT4_I(inode); ++ iloc.bh = 0; + + ret = __ext4_get_inode_loc(inode, &iloc, 0); + if (ret < 0) + goto bad_inode; +- bh = iloc.bh; + raw_inode = ext4_raw_inode(&iloc); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); +@@ -4810,7 +4809,6 @@ struct inode *ext4_iget(struct super_blo + if (inode->i_mode == 0 || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { + /* this inode is deleted */ +- brelse(bh); + ret = -ESTALE; + goto bad_inode; + } +@@ -4842,7 +4840,6 @@ struct inode *ext4_iget(struct super_blo + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > + EXT4_INODE_SIZE(inode->i_sb)) { +- brelse(bh); + ret = -EIO; + goto bad_inode; + } +@@ -4895,10 +4892,8 @@ struct inode *ext4_iget(struct super_blo + /* Validate block references which are part of inode */ + ret = ext4_check_inode_blockref(inode); + } +- if (ret) { +- brelse(bh); ++ if (ret) + goto bad_inode; +- } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext4_file_inode_operations; +@@ -4926,7 +4921,6 @@ struct inode *ext4_iget(struct super_blo + init_special_inode(inode, inode->i_mode, + new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); + } else { +- brelse(bh); + ret = -EIO; + ext4_error(inode->i_sb, __func__, + "bogus i_mode (%o) for inode=%lu", +@@ -4939,6 +4933,7 @@ struct inode *ext4_iget(struct super_blo + return inode; + + bad_inode: ++ 
brelse(iloc.bh); + iget_failed(inode); + return ERR_PTR(ret); + } diff --git a/queue-2.6.31/0063-ext4-make-sure-directory-and-symlink-blocks-are-revo.patch b/queue-2.6.31/0063-ext4-make-sure-directory-and-symlink-blocks-are-revo.patch new file mode 100644 index 00000000000..5b42b3ec73a --- /dev/null +++ b/queue-2.6.31/0063-ext4-make-sure-directory-and-symlink-blocks-are-revo.patch @@ -0,0 +1,62 @@ +From 3a9b0aea089a6b0df0dc16f2040c1719571929db Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 23 Nov 2009 07:17:34 -0500 +Subject: [PATCH 63/85] ext4: make sure directory and symlink blocks are revoked + +(cherry picked from commit 50689696867d95b38d9c7be640a311494a04fb86) + +When an inode gets unlinked, the functions ext4_clear_blocks() and +ext4_remove_blocks() call ext4_forget() for all the buffer heads +corresponding to the deleted inode's data blocks. If the inode is a +directory or a symlink, the is_metadata parameter must be non-zero so +ext4_forget() will revoke them via jbd2_journal_revoke(). Otherwise, +if these blocks are reused for a data file, and the system crashes +before a journal checkpoint, the journal replay could end up +corrupting these data blocks. + +Thanks to Curt Wohlgemuth for pointing out potential problems in this +area. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 2 +- + fs/ext4/inode.c | 6 ++++-- + 2 files changed, 5 insertions(+), 3 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2055,7 +2055,7 @@ static int ext4_remove_blocks(handle_t * + ext_debug("free last %u blocks starting %llu\n", num, start); + for (i = 0; i < num; i++) { + bh = sb_find_get_block(inode->i_sb, start + i); +- ext4_forget(handle, 0, inode, bh, start + i); ++ ext4_forget(handle, metadata, inode, bh, start + i); + } + ext4_free_blocks(handle, inode, start, num, metadata); + } else if (from == le32_to_cpu(ex->ee_block) +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4110,6 +4110,8 @@ static void ext4_clear_blocks(handle_t * + __le32 *last) + { + __le32 *p; ++ int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode); ++ + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); +@@ -4140,11 +4142,11 @@ static void ext4_clear_blocks(handle_t * + + *p = 0; + tbh = sb_find_get_block(inode->i_sb, nr); +- ext4_forget(handle, 0, inode, tbh, nr); ++ ext4_forget(handle, is_metadata, inode, tbh, nr); + } + } + +- ext4_free_blocks(handle, inode, block_to_free, count, 0); ++ ext4_free_blocks(handle, inode, block_to_free, count, is_metadata); + } + + /** diff --git a/queue-2.6.31/0064-ext4-fix-i_flags-access-in-ext4_da_writepages_trans_.patch b/queue-2.6.31/0064-ext4-fix-i_flags-access-in-ext4_da_writepages_trans_.patch new file mode 100644 index 00000000000..1fef360fe7f --- /dev/null +++ b/queue-2.6.31/0064-ext4-fix-i_flags-access-in-ext4_da_writepages_trans_.patch @@ -0,0 +1,29 @@ +From c990dc0ba4c47fd47465c1b670bd4298a47e1c8f Mon Sep 17 00:00:00 2001 +From: Julia Lawall +Date: Sun, 15 Nov 2009 15:30:58 -0500 +Subject: [PATCH 64/85] ext4: fix i_flags access in ext4_da_writepages_trans_blocks() + +(cherry picked from commit 30c6e07a92ea4cb87160d32ffa9bce172576ae4c) + +We need to 
be testing the i_flags field in the ext4 specific portion +of the inode, instead of the (confusingly aliased) i_flags field in +the generic struct inode. + +Signed-off-by: Julia Lawall +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2785,7 +2785,7 @@ static int ext4_da_writepages_trans_bloc + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ +- if (!(inode->i_flags & EXT4_EXTENTS_FL) && ++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + diff --git a/queue-2.6.31/0065-ext4-journal-all-modifications-in-ext4_xattr_set_han.patch b/queue-2.6.31/0065-ext4-journal-all-modifications-in-ext4_xattr_set_han.patch new file mode 100644 index 00000000000..b6f180d7366 --- /dev/null +++ b/queue-2.6.31/0065-ext4-journal-all-modifications-in-ext4_xattr_set_han.patch @@ -0,0 +1,43 @@ +From 1435fc00f13cfaa0b296ca4163464148d30c7917 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Sun, 15 Nov 2009 15:30:52 -0500 +Subject: [PATCH 65/85] ext4: journal all modifications in ext4_xattr_set_handle + +(cherry picked from commit 86ebfd08a1930ccedb8eac0aeb1ed4b8b6a41dbc) + +ext4_xattr_set_handle() was zeroing out an inode outside +of journaling constraints; this is one of the accesses that +was causing the crc errors in journal replay as seen in +kernel.org bugzilla #14354. 
+ +Reviewed-by: Andreas Dilger +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/xattr.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -988,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle, + if (error) + goto cleanup; + ++ error = ext4_journal_get_write_access(handle, is.iloc.bh); ++ if (error) ++ goto cleanup; ++ + if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); +@@ -1013,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle, + if (flags & XATTR_CREATE) + goto cleanup; + } +- error = ext4_journal_get_write_access(handle, is.iloc.bh); +- if (error) +- goto cleanup; + if (!value) { + if (!is.s.not_found) + error = ext4_xattr_ibody_set(handle, inode, &i, &is); diff --git a/queue-2.6.31/0066-ext4-don-t-update-the-superblock-in-ext4_statfs.patch b/queue-2.6.31/0066-ext4-don-t-update-the-superblock-in-ext4_statfs.patch new file mode 100644 index 00000000000..849366707ef --- /dev/null +++ b/queue-2.6.31/0066-ext4-don-t-update-the-superblock-in-ext4_statfs.patch @@ -0,0 +1,35 @@ +From 6bd9ab6fff91164288304a41b76f7623d1167fd5 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 23 Nov 2009 07:24:52 -0500 +Subject: [PATCH 66/85] ext4: don't update the superblock in ext4_statfs() + +(cherry picked from commit 3f8fb9490efbd300887470a2a880a64e04dcc3f5) + +commit a71ce8c6c9bf269b192f352ea555217815cf027e updated ext4_statfs() +to update the on-disk superblock counters, but modified this buffer +directly without any journaling of the change. This is one of the +accesses that was causing the crc errors in journal replay as seen in +kernel.org bugzilla #14354. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3693,13 +3693,11 @@ static int ext4_statfs(struct dentry *de + buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); +- ext4_free_blocks_count_set(es, buf->f_bfree); + buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); + if (buf->f_bfree < ext4_r_blocks_count(es)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); +- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); + buf->f_namelen = EXT4_NAME_LEN; + fsid = le64_to_cpup((void *)es->s_uuid) ^ + le64_to_cpup((void *)es->s_uuid + sizeof(u64)); diff --git a/queue-2.6.31/0067-ext4-fix-uninit-block-bitmap-initialization-when-s_m.patch b/queue-2.6.31/0067-ext4-fix-uninit-block-bitmap-initialization-when-s_m.patch new file mode 100644 index 00000000000..fc60f69775f --- /dev/null +++ b/queue-2.6.31/0067-ext4-fix-uninit-block-bitmap-initialization-when-s_m.patch @@ -0,0 +1,33 @@ +From 3ae02f241db96170c369a41f68fd06c907af3360 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 23 Nov 2009 07:24:38 -0500 +Subject: [PATCH 67/85] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero + +(cherry picked from commit 8dadb198cb70ef811916668fe67eeec82e8858dd) + +The number of old-style block group descriptor blocks is +s_meta_first_bg when the meta_bg feature flag is set. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/balloc.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_met + static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, + ext4_group_t group) + { +- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; ++ if (!ext4_bg_has_super(sb, group)) ++ return 0; ++ ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG)) ++ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); ++ else ++ return EXT4_SB(sb)->s_gdb_count; + } + + /** diff --git a/queue-2.6.31/0068-ext4-fix-block-validity-checks-so-they-work-correctl.patch b/queue-2.6.31/0068-ext4-fix-block-validity-checks-so-they-work-correctl.patch new file mode 100644 index 00000000000..8490800ad17 --- /dev/null +++ b/queue-2.6.31/0068-ext4-fix-block-validity-checks-so-they-work-correctl.patch @@ -0,0 +1,43 @@ +From b390118afb3c4c38a2c42d8b905c7382bbd69bca Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 15 Nov 2009 15:29:56 -0500 +Subject: [PATCH 68/85] ext4: fix block validity checks so they work correctly with meta_bg + +(cherry picked from commit 1032988c71f3f85483b2b4319684d1205a704c02) + +The block validity checks used by ext4_data_block_valid() wasn't +correctly written to check file systems with the meta_bg feature. Fix +this. 
+ +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/block_validity.c | 2 +- + fs/ext4/inode.c | 5 +---- + 2 files changed, 2 insertions(+), 5 deletions(-) + +--- a/fs/ext4/block_validity.c ++++ b/fs/ext4/block_validity.c +@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_ + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), +- sbi->s_gdb_count + 1); ++ ext4_bg_num_gdb(sb, i) + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4873,10 +4873,7 @@ struct inode *ext4_iget(struct super_blo + + ret = 0; + if (ei->i_file_acl && +- ((ei->i_file_acl < +- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) + +- EXT4_SB(sb)->s_gdb_count)) || +- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) { ++ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + ext4_error(sb, __func__, + "bad extended attribute block %llu in inode #%lu", + ei->i_file_acl, inode->i_ino); diff --git a/queue-2.6.31/0069-ext4-avoid-issuing-unnecessary-barriers.patch b/queue-2.6.31/0069-ext4-avoid-issuing-unnecessary-barriers.patch new file mode 100644 index 00000000000..a846d053246 --- /dev/null +++ b/queue-2.6.31/0069-ext4-avoid-issuing-unnecessary-barriers.patch @@ -0,0 +1,41 @@ +From 7f93bc075c55fa4dfdf1bd96fed3778794ca8d2a Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Mon, 23 Nov 2009 07:24:57 -0500 +Subject: [PATCH 69/85] ext4: avoid issuing unnecessary barriers + +(cherry picked from commit 6b17d902fdd241adfa4ce780df20547b28bf5801) + +We don't to issue an I/O barrier on an error or if we force commit +because we are doing data journaling. 
+ +Signed-off-by: "Theodore Ts'o" +Cc: Jan Kara +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/fsync.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -60,7 +60,7 @@ int ext4_sync_file(struct file *file, st + + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) +- goto out; ++ return ret; + /* + * data=writeback: + * The caller's filemap_fdatawrite()/wait will sync the data. +@@ -79,10 +79,8 @@ int ext4_sync_file(struct file *file, st + * (they were dirtied by commit). But that's OK - the blocks are + * safe in-journal, which is all fsync() needs to ensure. + */ +- if (ext4_should_journal_data(inode)) { +- ret = ext4_force_commit(inode->i_sb); +- goto out; +- } ++ if (ext4_should_journal_data(inode)) ++ return ext4_force_commit(inode->i_sb); + + if (!journal) + ret = sync_mapping_buffers(inode->i_mapping); diff --git a/queue-2.6.31/0070-ext4-fix-error-handling-in-ext4_ind_get_blocks.patch b/queue-2.6.31/0070-ext4-fix-error-handling-in-ext4_ind_get_blocks.patch new file mode 100644 index 00000000000..0427bea274b --- /dev/null +++ b/queue-2.6.31/0070-ext4-fix-error-handling-in-ext4_ind_get_blocks.patch @@ -0,0 +1,29 @@ +From f2747391268a891f6a3308f40c62afca16b6c292 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Mon, 23 Nov 2009 07:24:48 -0500 +Subject: [PATCH 70/85] ext4: fix error handling in ext4_ind_get_blocks() + +(cherry picked from commit 2bba702d4f88d7b010ec37e2527b552588404ae7) + +When an error happened in ext4_splice_branch we failed to notice that +in ext4_ind_get_blocks and mapped the buffer anyway. Fix the problem +by checking for error properly. 
+ +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1022,7 +1022,7 @@ static int ext4_ind_get_blocks(handle_t + if (!err) + err = ext4_splice_branch(handle, inode, iblock, + partial, indirect_blks, count); +- else ++ if (err) + goto cleanup; + + set_buffer_new(bh_result); diff --git a/queue-2.6.31/0071-ext4-make-trim-discard-optional-and-off-by-default.patch b/queue-2.6.31/0071-ext4-make-trim-discard-optional-and-off-by-default.patch new file mode 100644 index 00000000000..8f1d1404516 --- /dev/null +++ b/queue-2.6.31/0071-ext4-make-trim-discard-optional-and-off-by-default.patch @@ -0,0 +1,128 @@ +From 1cc420c479f028e811b68b3e67a0695c26642223 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Thu, 19 Nov 2009 14:25:42 -0500 +Subject: [PATCH 71/85] ext4: make trim/discard optional (and off by default) + +(cherry picked from commit 5328e635315734d42080de9a5a1ee87bf4cae0a4) + +It is anticipated that when sb_issue_discard starts doing +real work on trim-capable devices, we may see issues. Make +this mount-time optional, and default it to off until we know +that things are working out OK. + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/filesystems/ext4.txt | 6 ++++++ + fs/ext4/ext4.h | 1 + + fs/ext4/mballoc.c | 21 +++++++++++++-------- + fs/ext4/super.c | 14 +++++++++++++- + 4 files changed, 33 insertions(+), 9 deletions(-) + +--- a/Documentation/filesystems/ext4.txt ++++ b/Documentation/filesystems/ext4.txt +@@ -338,6 +338,12 @@ noauto_da_alloc replacing existing file + system crashes before the delayed allocation + blocks are forced to disk. + ++discard Controls whether ext4 should issue discard/TRIM ++nodiscard(*) commands to the underlying block device when ++ blocks are freed. 
This is useful for SSD devices ++ and sparse/thinly-provisioned LUNs, but it is off ++ by default until sufficient testing has been done. ++ + Data Mode + ========= + There are 3 different data modes: +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -747,6 +747,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ + #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ ++#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ + + #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt + #define set_opt(o, opt) o |= EXT4_MOUNT_##opt +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2810,7 +2810,6 @@ static void release_blocks_on_commit(jou + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; +- ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; + + list_for_each_safe(l, ltmp, &txn->t_private_list) { +@@ -2840,13 +2839,19 @@ static void release_blocks_on_commit(jou + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->group); +- discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) +- + entry->start_blk +- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +- trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, +- entry->count); +- sb_issue_discard(sb, discard_block, entry->count); +- ++ if (test_opt(sb, DISCARD)) { ++ ext4_fsblk_t discard_block; ++ struct ext4_super_block *es = EXT4_SB(sb)->s_es; ++ ++ discard_block = (ext4_fsblk_t)entry->group * ++ EXT4_BLOCKS_PER_GROUP(sb) ++ + entry->start_blk ++ + le32_to_cpu(es->s_first_data_block); ++ trace_ext4_discard_blocks(sb, ++ (unsigned long long)discard_block, ++ entry->count); ++ sb_issue_discard(sb, discard_block, entry->count); ++ } + kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_mb_release_desc(&e4b); + } +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -906,6 
+906,9 @@ static int ext4_show_options(struct seq_ + if (test_opt(sb, NO_AUTO_DA_ALLOC)) + seq_puts(seq, ",noauto_da_alloc"); + ++ if (test_opt(sb, DISCARD)) ++ seq_puts(seq, ",discard"); ++ + ext4_show_quota_options(seq, sb); + + return 0; +@@ -1086,7 +1089,8 @@ enum { + Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, +- Opt_inode_readahead_blks, Opt_journal_ioprio ++ Opt_inode_readahead_blks, Opt_journal_ioprio, ++ Opt_discard, Opt_nodiscard, + }; + + static const match_table_t tokens = { +@@ -1152,6 +1156,8 @@ static const match_table_t tokens = { + {Opt_auto_da_alloc, "auto_da_alloc=%u"}, + {Opt_auto_da_alloc, "auto_da_alloc"}, + {Opt_noauto_da_alloc, "noauto_da_alloc"}, ++ {Opt_discard, "discard"}, ++ {Opt_nodiscard, "nodiscard"}, + {Opt_err, NULL}, + }; + +@@ -1580,6 +1586,12 @@ set_qf_format: + else + set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); + break; ++ case Opt_discard: ++ set_opt(sbi->s_mount_opt, DISCARD); ++ break; ++ case Opt_nodiscard: ++ clear_opt(sbi->s_mount_opt, DISCARD); ++ break; + default: + ext4_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " diff --git a/queue-2.6.31/0072-ext4-make-norecovery-an-alias-for-noload.patch b/queue-2.6.31/0072-ext4-make-norecovery-an-alias-for-noload.patch new file mode 100644 index 00000000000..1e0f099e7d4 --- /dev/null +++ b/queue-2.6.31/0072-ext4-make-norecovery-an-alias-for-noload.patch @@ -0,0 +1,57 @@ +From 726beafb270e939bb40a2729ae848b80d146ddd6 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Thu, 19 Nov 2009 14:28:50 -0500 +Subject: [PATCH 72/85] ext4: make "norecovery" an alias for "noload" + +(cherry picked from commit e3bb52ae2bb9573e84c17b8e3560378d13a5c798) + +Users on the linux-ext4 list recently complained about differences +across filesystems w.r.t. how to mount without a journal replay. 
+ +In the discussion it was noted that xfs's "norecovery" option is +perhaps more descriptively accurate than "noload," so let's make +that an alias for ext4. + +Also show this status in /proc/mounts + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/filesystems/ext4.txt | 4 ++-- + fs/ext4/super.c | 4 ++++ + 2 files changed, 6 insertions(+), 2 deletions(-) + +--- a/Documentation/filesystems/ext4.txt ++++ b/Documentation/filesystems/ext4.txt +@@ -153,8 +153,8 @@ journal_dev=devnum When the external jou + identified through its new major/minor numbers encoded + in devnum. + +-noload Don't load the journal on mounting. Note that +- if the filesystem was not unmounted cleanly, ++norecovery Don't load the journal on mounting. Note that ++noload if the filesystem was not unmounted cleanly, + skipping the journal replay will lead to the + filesystem containing inconsistencies that can + lead to any number of problems. +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -909,6 +909,9 @@ static int ext4_show_options(struct seq_ + if (test_opt(sb, DISCARD)) + seq_puts(seq, ",discard"); + ++ if (test_opt(sb, NOLOAD)) ++ seq_puts(seq, ",norecovery"); ++ + ext4_show_quota_options(seq, sb); + + return 0; +@@ -1115,6 +1118,7 @@ static const match_table_t tokens = { + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_noload, "noload"}, ++ {Opt_noload, "norecovery"}, + {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, + {Opt_commit, "commit=%u"}, diff --git a/queue-2.6.31/0073-ext4-Fix-double-free-of-blocks-with-EXT4_IOC_MOVE_EX.patch b/queue-2.6.31/0073-ext4-Fix-double-free-of-blocks-with-EXT4_IOC_MOVE_EX.patch new file mode 100644 index 00000000000..e09c39d443d --- /dev/null +++ b/queue-2.6.31/0073-ext4-Fix-double-free-of-blocks-with-EXT4_IOC_MOVE_EX.patch @@ -0,0 +1,79 @@ +From 0049f11e6571cac0b24c7483c4df1caa16a5cb7f Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Tue, 24 Nov 2009 10:19:57 -0500 +Subject: [PATCH 73/85] 
ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT + +(cherry picked from commit 94d7c16cbbbd0e03841fcf272bcaf0620ad39618) + +At the beginning of ext4_move_extent(), we call +ext4_discard_preallocations() to discard inode PAs of orig and donor +inodes. But in the following case, blocks can be double freed, so +move ext4_discard_preallocations() to the end of ext4_move_extents(). + +1. Discard inode PAs of orig and donor inodes with + ext4_discard_preallocations() in ext4_move_extents(). + + orig : [ DATA1 ] + donor: [ DATA2 ] + +2. While data blocks are exchanging between orig and donor inodes, new + inode PAs is created to orig by other process's block allocation. + (Since there are semaphore gaps in ext4_move_extents().) And new + inode PAs is used partially (2-1). + + 2-1 Create new inode PAs to orig inode + orig : [ DATA1 | used PA1 | free PA1 ] + donor: [ DATA2 ] + +3. Donor inode which has old orig inode's blocks is deleted after + EXT4_IOC_MOVE_EXT finished (3-1, 3-2). So the block bitmap + corresponds to old orig inode's blocks are freed. + + 3-1 After EXT4_IOC_MOVE_EXT finished + orig : [ DATA2 | free PA1 ] + donor: [ DATA1 | used PA1 ] + + 3-2 Delete donor inode + orig : [ DATA2 | free PA1 ] + donor: [ FREE SPACE(DATA1) | FREE SPACE(used PA1) ] + +4. The double-free of blocks is occurred, when close() is called to + orig inode. Because ext4_discard_preallocations() for orig inode + frees used PA1 and free PA1, though used PA1 is already freed in 3. 
+ + 4-1 Double-free of blocks is occurred + orig : [ DATA2 | FREE SPACE(free PA1) ] + donor: [ FREE SPACE(DATA1) | DOUBLE FREE(used PA1) ] + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -1289,10 +1289,6 @@ ext4_move_extents(struct file *o_filp, s + ext4_ext_get_actual_len(ext_cur), block_end + 1) - + max(le32_to_cpu(ext_cur->ee_block), block_start); + +- /* Discard preallocations of two inodes */ +- ext4_discard_preallocations(orig_inode); +- ext4_discard_preallocations(donor_inode); +- + while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { + seq_blocks += add_blocks; + +@@ -1410,6 +1406,11 @@ ext4_move_extents(struct file *o_filp, s + + } + out: ++ if (*moved_len) { ++ ext4_discard_preallocations(orig_inode); ++ ext4_discard_preallocations(donor_inode); ++ } ++ + if (orig_path) { + ext4_ext_drop_refs(orig_path); + kfree(orig_path); diff --git a/queue-2.6.31/0074-ext4-initialize-moved_len-before-calling-ext4_move_e.patch b/queue-2.6.31/0074-ext4-initialize-moved_len-before-calling-ext4_move_e.patch new file mode 100644 index 00000000000..797e5965335 --- /dev/null +++ b/queue-2.6.31/0074-ext4-initialize-moved_len-before-calling-ext4_move_e.patch @@ -0,0 +1,76 @@ +From 5fff7e08334b2441b44b19736801b76610191ae9 Mon Sep 17 00:00:00 2001 +From: Kazuya Mio +Date: Tue, 24 Nov 2009 10:28:48 -0500 +Subject: [PATCH 74/85] ext4: initialize moved_len before calling ext4_move_extents() + +(cherry picked from commit 446aaa6e7e993b38a6f21c6acfa68f3f1af3dbe3) + +The move_extent.moved_len is used to pass back the number of exchanged +blocks count to user space. Currently the caller must clear this +field; but we spend more code space checking for this requirement than +simply zeroing the field ourselves, so let's just make life easier for +everyone all around. 
+ +Signed-off-by: Kazuya Mio +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ioctl.c | 1 + + fs/ext4/move_extent.c | 14 +++----------- + 2 files changed, 4 insertions(+), 11 deletions(-) + +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -239,6 +239,7 @@ setversion_out: + } + } + ++ me.moved_len = 0; + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); + fput(donor_filp); +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -947,7 +947,6 @@ out2: + * @orig_start: logical start offset in block for orig + * @donor_start: logical start offset in block for donor + * @len: the number of blocks to be moved +- * @moved_len: moved block length + * + * Check the arguments of ext4_move_extents() whether the files can be + * exchanged with each other. +@@ -955,8 +954,8 @@ out2: + */ + static int + mext_check_arguments(struct inode *orig_inode, +- struct inode *donor_inode, __u64 orig_start, +- __u64 donor_start, __u64 *len, __u64 moved_len) ++ struct inode *donor_inode, __u64 orig_start, ++ __u64 donor_start, __u64 *len) + { + ext4_lblk_t orig_blocks, donor_blocks; + unsigned int blkbits = orig_inode->i_blkbits; +@@ -1010,13 +1009,6 @@ mext_check_arguments(struct inode *orig_ + return -EINVAL; + } + +- if (moved_len) { +- ext4_debug("ext4 move extent: moved_len should be 0 " +- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, +- donor_inode->i_ino); +- return -EINVAL; +- } +- + if ((orig_start > MAX_DEFRAG_SIZE) || + (donor_start > MAX_DEFRAG_SIZE) || + (*len > MAX_DEFRAG_SIZE) || +@@ -1226,7 +1218,7 @@ ext4_move_extents(struct file *o_filp, s + double_down_write_data_sem(orig_inode, donor_inode); + /* Check the filesystem environment whether move_extent can be done */ + ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, +- donor_start, &len, *moved_len); ++ donor_start, &len); + if (ret1) + goto out; + diff --git 
a/queue-2.6.31/0075-ext4-move_extent_per_page-cleanup.patch b/queue-2.6.31/0075-ext4-move_extent_per_page-cleanup.patch new file mode 100644 index 00000000000..7cd8a816922 --- /dev/null +++ b/queue-2.6.31/0075-ext4-move_extent_per_page-cleanup.patch @@ -0,0 +1,91 @@ +From 10ea35c834241a2400b470cf574105e9cf3a9679 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Tue, 24 Nov 2009 10:31:56 -0500 +Subject: [PATCH 75/85] ext4: move_extent_per_page() cleanup + +(cherry picked from commit ac48b0a1d068887141581bea8285de5fcab182b0) + +Integrate duplicate lines (acquire/release semaphore and invalidate +extent cache in move_extent_per_page()) into mext_replace_branches(), +to reduce source and object code size. + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 30 +++++++++--------------------- + 1 file changed, 9 insertions(+), 21 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -660,6 +660,9 @@ mext_replace_branches(handle_t *handle, + int replaced_count = 0; + int dext_alen; + ++ /* Protect extent trees against block allocations via delalloc */ ++ double_down_write_data_sem(orig_inode, donor_inode); ++ + /* Get the original extent for the block "orig_off" */ + *err = get_ext_path(orig_inode, orig_off, &orig_path); + if (*err) +@@ -755,6 +758,11 @@ out: + kfree(donor_path); + } + ++ ext4_ext_invalidate_cache(orig_inode); ++ ext4_ext_invalidate_cache(donor_inode); ++ ++ double_up_write_data_sem(orig_inode, donor_inode); ++ + return replaced_count; + } + +@@ -820,19 +828,9 @@ move_extent_per_page(struct file *o_filp + * Just swap data blocks between orig and donor. 
+ */ + if (uninit) { +- /* +- * Protect extent trees against block allocations +- * via delalloc +- */ +- double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, + donor_inode, orig_blk_offset, + block_len_in_page, err); +- +- /* Clear the inode cache not to refer to the old data */ +- ext4_ext_invalidate_cache(orig_inode); +- ext4_ext_invalidate_cache(donor_inode); +- double_up_write_data_sem(orig_inode, donor_inode); + goto out2; + } + +@@ -880,8 +878,6 @@ move_extent_per_page(struct file *o_filp + /* Release old bh and drop refs */ + try_to_release_page(page, 0); + +- /* Protect extent trees against block allocations via delalloc */ +- double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, + orig_blk_offset, block_len_in_page, + &err2); +@@ -890,18 +886,10 @@ move_extent_per_page(struct file *o_filp + block_len_in_page = replaced_count; + replaced_size = + block_len_in_page << orig_inode->i_blkbits; +- } else { +- double_up_write_data_sem(orig_inode, donor_inode); ++ } else + goto out; +- } + } + +- /* Clear the inode cache not to refer to the old data */ +- ext4_ext_invalidate_cache(orig_inode); +- ext4_ext_invalidate_cache(donor_inode); +- +- double_up_write_data_sem(orig_inode, donor_inode); +- + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); + diff --git a/queue-2.6.31/0076-jbd2-Add-ENOMEM-checking-in-and-for-jbd2_journal_wri.patch b/queue-2.6.31/0076-jbd2-Add-ENOMEM-checking-in-and-for-jbd2_journal_wri.patch new file mode 100644 index 00000000000..e05164d902c --- /dev/null +++ b/queue-2.6.31/0076-jbd2-Add-ENOMEM-checking-in-and-for-jbd2_journal_wri.patch @@ -0,0 +1,42 @@ +From fa1b33dd4ff545b7bfd10eac8b0833d338135306 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Tue, 1 Dec 2009 09:04:42 -0500 +Subject: [PATCH 76/85] jbd2: Add ENOMEM checking in and for 
jbd2_journal_write_metadata_buffer() + +(cherry picked from commit e6ec116b67f46e0e7808276476554727b2e6240b) + +OOM happens. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/jbd2/commit.c | 4 ++++ + fs/jbd2/journal.c | 4 ++++ + 2 files changed, 8 insertions(+) + +--- a/fs/jbd2/commit.c ++++ b/fs/jbd2/commit.c +@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(jou + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = jbd2_journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); ++ if (flags < 0) { ++ jbd2_journal_abort(journal, flags); ++ continue; ++ } + set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + wbuf[bufs++] = jh2bh(new_jh); + +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -361,6 +361,10 @@ repeat: + + jbd_unlock_bh_state(bh_in); + tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); ++ if (!tmp) { ++ jbd2_journal_put_journal_head(new_jh); ++ return -ENOMEM; ++ } + jbd_lock_bh_state(bh_in); + if (jh_in->b_frozen_data) { + jbd2_free(tmp, bh_in->b_size); diff --git a/queue-2.6.31/0077-ext4-Return-the-PTR_ERR-of-the-correct-pointer-in-se.patch b/queue-2.6.31/0077-ext4-Return-the-PTR_ERR-of-the-correct-pointer-in-se.patch new file mode 100644 index 00000000000..5cc137084e7 --- /dev/null +++ b/queue-2.6.31/0077-ext4-Return-the-PTR_ERR-of-the-correct-pointer-in-se.patch @@ -0,0 +1,25 @@ +From c2615d59849368a9f7e8f56b580831b0221d4fad Mon Sep 17 00:00:00 2001 +From: Roel Kluin +Date: Mon, 7 Dec 2009 10:38:16 -0500 +Subject: [PATCH 77/85] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks() + +(cherry picked from commit c09eef305dd43846360944ad072f051f964fa383) + +Signed-off-by: Roel Kluin +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/resize.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct + goto exit_bh; + + if (IS_ERR(gdb = bclean(handle, 
sb, block))) { +- err = PTR_ERR(bh); ++ err = PTR_ERR(gdb); + goto exit_bh; + } + ext4_handle_dirty_metadata(handle, NULL, gdb); diff --git a/queue-2.6.31/0078-ext4-Avoid-data-filesystem-corruption-when-write-fai.patch b/queue-2.6.31/0078-ext4-Avoid-data-filesystem-corruption-when-write-fai.patch new file mode 100644 index 00000000000..89dbacb4f39 --- /dev/null +++ b/queue-2.6.31/0078-ext4-Avoid-data-filesystem-corruption-when-write-fai.patch @@ -0,0 +1,88 @@ +From 11517f4e2c0350204f02d970abff784bc139a094 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Tue, 8 Dec 2009 21:24:33 -0500 +Subject: [PATCH 78/85] ext4: Avoid data / filesystem corruption when write fails to copy data + +(cherry picked from commit b9a4207d5e911b938f73079a83cc2ae10524ec7f) + +When ext4_write_begin fails after allocating some blocks or +generic_perform_write fails to copy data to write, we truncate blocks +already instantiated beyond i_size. Although these blocks were never +inside i_size, we have to truncate the pagecache of these blocks so +that corresponding buffers get unmapped. Otherwise subsequent +__block_prepare_write (called because we are retrying the write) will +find the buffers mapped, not call ->get_block, and thus the page will +be backed by already freed blocks leading to filesystem and data +corruption. + +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1535,6 +1535,16 @@ static int do_journal_get_write_access(h + return ext4_journal_get_write_access(handle, bh); + } + ++/* ++ * Truncate blocks that were not used by write. We have to truncate the ++ * pagecache as well so that corresponding buffers get properly unmapped. 
++ */ ++static void ext4_truncate_failed_write(struct inode *inode) ++{ ++ truncate_inode_pages(inode->i_mapping, inode->i_size); ++ ext4_truncate(inode); ++} ++ + static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +@@ -1600,7 +1610,7 @@ retry: + + ext4_journal_stop(handle); + if (pos + len > inode->i_size) { +- ext4_truncate(inode); ++ ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might + * still be on the orphan list; we need to +@@ -1710,7 +1720,7 @@ static int ext4_ordered_write_end(struct + ret = ret2; + + if (pos + len > inode->i_size) { +- ext4_truncate(inode); ++ ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode +@@ -1752,7 +1762,7 @@ static int ext4_writeback_write_end(stru + ret = ret2; + + if (pos + len > inode->i_size) { +- ext4_truncate(inode); ++ ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode +@@ -1815,7 +1825,7 @@ static int ext4_journalled_write_end(str + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { +- ext4_truncate(inode); ++ ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode +@@ -3087,7 +3097,7 @@ retry: + * i_size_read because we hold i_mutex. 
+ */ + if (pos + len > inode->i_size) +- ext4_truncate(inode); ++ ext4_truncate_failed_write(inode); + } + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) diff --git a/queue-2.6.31/0079-ext4-wait-for-log-to-commit-when-umounting.patch b/queue-2.6.31/0079-ext4-wait-for-log-to-commit-when-umounting.patch new file mode 100644 index 00000000000..37196534e2b --- /dev/null +++ b/queue-2.6.31/0079-ext4-wait-for-log-to-commit-when-umounting.patch @@ -0,0 +1,50 @@ +From c086b107c572cba6d3600ae292a1e23627756fb0 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Tue, 8 Dec 2009 21:48:58 -0500 +Subject: [PATCH 79/85] ext4: wait for log to commit when umounting + +(cherry picked from commit d4edac314e9ad0b21ba20ba8bc61b61f186f79e1) + +There is a potential race when a transaction is committing right when +the file system is being unmounted. This could result in a race +because EXT4_SB(sb)->s_group_info could be freed in ext4_put_super +before the commit code calls a callback so the mballoc code can +release freed blocks in the transaction, resulting in a panic trying +to access the freed s_group_info. + +The fix is to wait for the transaction to finish committing before we +shutdown the multiblock allocator. 
+ +Signed-off-by: Josef Bacik +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -610,10 +610,6 @@ static void ext4_put_super(struct super_ + if (sb->s_dirt) + ext4_commit_super(sb, 1); + +- ext4_release_system_zone(sb); +- ext4_mb_release(sb); +- ext4_ext_release(sb); +- ext4_xattr_put_super(sb); + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; +@@ -621,6 +617,12 @@ static void ext4_put_super(struct super_ + ext4_abort(sb, __func__, + "Couldn't clean up the journal"); + } ++ ++ ext4_release_system_zone(sb); ++ ext4_mb_release(sb); ++ ext4_ext_release(sb); ++ ext4_xattr_put_super(sb); ++ + if (!(sb->s_flags & MS_RDONLY)) { + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + es->s_state = cpu_to_le16(sbi->s_mount_state); diff --git a/queue-2.6.31/0080-ext4-remove-blocks-from-inode-prealloc-list-on-failu.patch b/queue-2.6.31/0080-ext4-remove-blocks-from-inode-prealloc-list-on-failu.patch new file mode 100644 index 00000000000..9dcdb2a14cf --- /dev/null +++ b/queue-2.6.31/0080-ext4-remove-blocks-from-inode-prealloc-list-on-failu.patch @@ -0,0 +1,53 @@ +From a6b78cd05191ca940c21046c35d0845fc7eed57e Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Tue, 8 Dec 2009 22:18:25 -0500 +Subject: [PATCH 80/85] ext4: remove blocks from inode prealloc list on failure + +(cherry picked from commit b844167edc7fcafda9623955c05e4c1b3c32ebc7) + +This fixes a leak of blocks in an inode prealloc list if device failures +cause ext4_mb_mark_diskspace_used() to fail. 
+ +Signed-off-by: Curt Wohlgemuth +Acked-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3258,6 +3258,24 @@ static void ext4_mb_collect_stats(struct + } + + /* ++ * Called on failure; free up any blocks from the inode PA for this ++ * context. We don't need this for MB_GROUP_PA because we only change ++ * pa_free in ext4_mb_release_context(), but on failure, we've already ++ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. ++ */ ++static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) ++{ ++ struct ext4_prealloc_space *pa = ac->ac_pa; ++ int len; ++ ++ if (pa && pa->pa_type == MB_INODE_PA) { ++ len = ac->ac_b_ex.fe_len; ++ pa->pa_free += len; ++ } ++ ++} ++ ++/* + * use blocks preallocated to inode + */ + static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, +@@ -4546,6 +4564,7 @@ repeat: + ac->ac_status = AC_STATUS_CONTINUE; + goto repeat; + } else if (*errp) { ++ ext4_discard_allocated_blocks(ac); + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); diff --git a/queue-2.6.31/0081-ext4-ext4_get_reserved_space-must-return-bytes-inste.patch b/queue-2.6.31/0081-ext4-ext4_get_reserved_space-must-return-bytes-inste.patch new file mode 100644 index 00000000000..505a6574397 --- /dev/null +++ b/queue-2.6.31/0081-ext4-ext4_get_reserved_space-must-return-bytes-inste.patch @@ -0,0 +1,27 @@ +From 63df45b254921e67d027cac706cabb06032ea836 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Tue, 8 Dec 2009 22:41:52 -0500 +Subject: [PATCH 81/85] ext4: ext4_get_reserved_space() must return bytes instead of blocks + +(cherry picked from commit 8aa6790f876e81f5a2211fe1711a5fe3fe2d7b20) + +Signed-off-by: Dmitry Monakhov +Reviewed-by: Eric Sandeen +Acked-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg 
Kroah-Hartman +--- + fs/ext4/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1053,7 +1053,7 @@ qsize_t ext4_get_reserved_space(struct i + EXT4_I(inode)->i_reserved_meta_blocks; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + +- return total; ++ return (total << inode->i_blkbits); + } + /* + * Calculate the number of metadata blocks need to reserve diff --git a/queue-2.6.31/0082-ext4-quota-macros-cleanup.patch b/queue-2.6.31/0082-ext4-quota-macros-cleanup.patch new file mode 100644 index 00000000000..270f5620c6d --- /dev/null +++ b/queue-2.6.31/0082-ext4-quota-macros-cleanup.patch @@ -0,0 +1,142 @@ +From e6590a67bfe0a6efeb55de4cbecb22081c1d8ea3 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Tue, 8 Dec 2009 22:42:15 -0500 +Subject: [PATCH 82/85] ext4: quota macros cleanup + +(cherry picked from commit 5aca07eb7d8f14d90c740834d15ca15277f4820c) + +Currently all quota block reservation macros contains hard-coded "2" +aka MAXQUOTAS value. This is no good because in some places it is not +obvious to understand what does this digit represent. Let's introduce +new macro with self descriptive name. + +Signed-off-by: Dmitry Monakhov +Acked-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4_jbd2.h | 8 ++++++-- + fs/ext4/extents.c | 2 +- + fs/ext4/inode.c | 2 +- + fs/ext4/migrate.c | 4 ++-- + fs/ext4/namei.c | 8 ++++---- + 5 files changed, 14 insertions(+), 10 deletions(-) + +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -49,7 +49,7 @@ + + #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ +- 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) ++ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + + /* + * Define the number of metadata blocks we need to account to modify data. 
+@@ -57,7 +57,7 @@ + * This include super block, inode block, quota blocks and xattr blocks + */ + #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ +- 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) ++ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) + + /* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be +@@ -92,6 +92,7 @@ + * but inode, sb and group updates are done only once */ + #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) ++ + #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) + #else +@@ -99,6 +100,9 @@ + #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 + #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 + #endif ++#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) ++#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) ++#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) + + int + ext4_mark_iloc_dirty(handle_t *handle, +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2147,7 +2147,7 @@ ext4_ext_rm_leaf(handle_t *handle, struc + correct_index = 1; + credits += (ext_depth(inode)) + 1; + } +- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); ++ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + if (err) +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5221,7 +5221,7 @@ int ext4_setattr(struct dentry *dentry, + + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? 
- but truncate inode update has it) */ +- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ ++ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(hand + * So allocate a credit of 3. We may update + * quota (user and group). + */ +- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); ++ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); + + if (ext4_journal_extend(handle, needed) != 0) + retval = ext4_journal_restart(handle, needed); +@@ -477,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode + handle = ext4_journal_start(inode, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) ++ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + + 1); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir + retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); ++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, + retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); ++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + +@@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, + retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + +- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); ++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return 
PTR_ERR(handle); + +@@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *di + retry: + handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + +- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); ++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + if (IS_ERR(handle)) + return PTR_ERR(handle); + diff --git a/queue-2.6.31/0083-ext4-fix-incorrect-block-reservation-on-quota-transf.patch b/queue-2.6.31/0083-ext4-fix-incorrect-block-reservation-on-quota-transf.patch new file mode 100644 index 00000000000..4ba274b1e15 --- /dev/null +++ b/queue-2.6.31/0083-ext4-fix-incorrect-block-reservation-on-quota-transf.patch @@ -0,0 +1,31 @@ +From 456f450ba99dab1a4109b53744161ba0eddcf6ee Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Tue, 8 Dec 2009 22:42:28 -0500 +Subject: [PATCH 83/85] ext4: fix incorrect block reservation on quota transfer. + +(cherry picked from commit 194074acacebc169ded90a4657193f5180015051) + +Inside ->setattr() call both ATTR_UID and ATTR_GID may be valid. +This means that we may end-up with transferring all quotas. And +we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in +case of QUOTA_INIT_BLOCKS. + +Signed-off-by: Dmitry Monakhov +Reviewed-by: Mingming Cao +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5222,7 +5222,7 @@ int ext4_setattr(struct dentry *dentry, + /* (user+group)*(old+new) structure, inode write (sb, + * inode block, ? 
- but truncate inode update has it) */ + handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ +- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); ++ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; diff --git a/queue-2.6.31/0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch b/queue-2.6.31/0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch new file mode 100644 index 00000000000..865a062350c --- /dev/null +++ b/queue-2.6.31/0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch @@ -0,0 +1,256 @@ +From 54c03a4d3da37205aac158f59210e3cab24b7b36 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 10 Dec 2009 00:50:57 -0500 +Subject: [PATCH 84/85] ext4: Wait for proper transaction commit on fsync + +(cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645) + +We cannot rely on buffer dirty bits during fsync because pdflush can come +before fsync is called and clear dirty bits without forcing a transaction +commit. What we do is that we track which transaction has last changed +the inode and which transaction last changed allocation and force it to +disk on fsync. + +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 7 +++++++ + fs/ext4/ext4_jbd2.h | 13 +++++++++++++ + fs/ext4/extents.c | 14 ++++++++++++-- + fs/ext4/fsync.c | 46 +++++++++++++++++----------------------------- + fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++ + fs/ext4/super.c | 2 ++ + fs/jbd2/journal.c | 1 + + 7 files changed, 81 insertions(+), 31 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -700,6 +700,13 @@ struct ext4_inode_info { + struct list_head i_aio_dio_complete_list; + /* current io_end structure for async DIO write*/ + ext4_io_end_t *cur_aio_dio; ++ ++ /* ++ * Transactions that contain inode's metadata needed to complete ++ * fsync and fdatasync, respectively. 
++ */ ++ tid_t i_sync_tid; ++ tid_t i_datasync_tid; + }; + + /* +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(h + return 0; + } + ++static inline void ext4_update_inode_fsync_trans(handle_t *handle, ++ struct inode *inode, ++ int datasync) ++{ ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ ++ if (ext4_handle_valid(handle)) { ++ ei->i_sync_tid = handle->h_transaction->t_tid; ++ if (datasync) ++ ei->i_datasync_tid = handle->h_transaction->t_tid; ++ } ++} ++ + /* super.c */ + int ext4_force_commit(struct super_block *sb); + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3041,6 +3041,8 @@ ext4_ext_handle_uninitialized_extents(ha + if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { + ret = ext4_convert_unwritten_extents_dio(handle, inode, + path); ++ if (ret >= 0) ++ ext4_update_inode_fsync_trans(handle, inode, 1); + goto out2; + } + /* buffered IO case */ +@@ -3068,6 +3070,8 @@ ext4_ext_handle_uninitialized_extents(ha + ret = ext4_ext_convert_to_initialized(handle, inode, + path, iblock, + max_blocks); ++ if (ret >= 0) ++ ext4_update_inode_fsync_trans(handle, inode, 1); + out: + if (ret <= 0) { + err = ret; +@@ -3306,10 +3310,16 @@ int ext4_ext_get_blocks(handle_t *handle + allocated = ext4_ext_get_actual_len(&newex); + set_buffer_new(bh_result); + +- /* Cache only when it is _not_ an uninitialized extent */ +- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ++ /* ++ * Cache the extent and update transaction to commit on fdatasync only ++ * when it is _not_ an uninitialized extent. 
++ */ ++ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { + ext4_ext_put_in_cache(inode, iblock, allocated, newblock, + EXT4_EXT_CACHE_EXTENT); ++ ext4_update_inode_fsync_trans(handle, inode, 1); ++ } else ++ ext4_update_inode_fsync_trans(handle, inode, 0); + out: + if (allocated > max_blocks) + allocated = max_blocks; +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -51,25 +51,30 @@ + int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) + { + struct inode *inode = dentry->d_inode; ++ struct ext4_inode_info *ei = EXT4_I(inode); + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; +- int err, ret = 0; ++ int ret; ++ tid_t commit_tid; + + J_ASSERT(ext4_journal_current_handle() == NULL); + + trace_ext4_sync_file(file, dentry, datasync); + ++ if (inode->i_sb->s_flags & MS_RDONLY) ++ return 0; ++ + ret = flush_aio_dio_completed_IO(inode); + if (ret < 0) + return ret; ++ ++ if (!journal) ++ return simple_fsync(file, dentry, datasync); ++ + /* +- * data=writeback: ++ * data=writeback,ordered: + * The caller's filemap_fdatawrite()/wait will sync the data. +- * sync_inode() will sync the metadata +- * +- * data=ordered: +- * The caller's filemap_fdatawrite() will write the data and +- * sync_inode() will write the inode if it is dirty. Then the caller's +- * filemap_fdatawait() will wait on the pages. ++ * Metadata is in the journal, we wait for proper transaction to ++ * commit here. + * + * data=journal: + * filemap_fdatawrite won't do anything (the buffers are clean). +@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, st + if (ext4_should_journal_data(inode)) + return ext4_force_commit(inode->i_sb); + +- if (!journal) +- ret = sync_mapping_buffers(inode->i_mapping); +- +- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) +- goto out; +- +- /* +- * The VFS has written the file data. If the inode is unaltered +- * then we need not start a commit. 
+- */ +- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) { +- struct writeback_control wbc = { +- .sync_mode = WB_SYNC_ALL, +- .nr_to_write = 0, /* sys_fsync did this */ +- }; +- err = sync_inode(inode, &wbc); +- if (ret == 0) +- ret = err; +- } +-out: +- if (journal && (journal->j_flags & JBD2_BARRIER)) ++ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; ++ if (jbd2_log_start_commit(journal, commit_tid)) ++ jbd2_log_wait_commit(journal, commit_tid); ++ else if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + return ret; + } +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1026,6 +1026,8 @@ static int ext4_ind_get_blocks(handle_t + goto cleanup; + + set_buffer_new(bh_result); ++ ++ ext4_update_inode_fsync_trans(handle, inode, 1); + got_it: + map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); + if (count > blocks_to_boundary) +@@ -4784,6 +4786,7 @@ struct inode *ext4_iget(struct super_blo + struct ext4_inode *raw_inode; + struct ext4_inode_info *ei; + struct inode *inode; ++ journal_t *journal = EXT4_SB(sb)->s_journal; + long ret; + int block; + +@@ -4848,6 +4851,31 @@ struct inode *ext4_iget(struct super_blo + ei->i_data[block] = raw_inode->i_block[block]; + INIT_LIST_HEAD(&ei->i_orphan); + ++ /* ++ * Set transaction id's of transactions that have to be committed ++ * to finish f[data]sync. We set them to currently running transaction ++ * as we cannot be sure that the inode or some of its metadata isn't ++ * part of the transaction - the inode could have been reclaimed and ++ * now it is reread from disk. 
++ */ ++ if (journal) { ++ transaction_t *transaction; ++ tid_t tid; ++ ++ spin_lock(&journal->j_state_lock); ++ if (journal->j_running_transaction) ++ transaction = journal->j_running_transaction; ++ else ++ transaction = journal->j_committing_transaction; ++ if (transaction) ++ tid = transaction->t_tid; ++ else ++ tid = journal->j_commit_sequence; ++ spin_unlock(&journal->j_state_lock); ++ ei->i_sync_tid = tid; ++ ei->i_datasync_tid = tid; ++ } ++ + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { + ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > +@@ -5102,6 +5130,7 @@ static int ext4_do_update_inode(handle_t + err = rc; + ei->i_state &= ~EXT4_STATE_NEW; + ++ ext4_update_inode_fsync_trans(handle, inode, 0); + out_brelse: + brelse(bh); + ext4_std_error(inode->i_sb, err); +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -713,6 +713,8 @@ static struct inode *ext4_alloc_inode(st + spin_lock_init(&(ei->i_block_reservation_lock)); + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); + ei->cur_aio_dio = NULL; ++ ei->i_sync_tid = 0; ++ ei->i_datasync_tid = 0; + + return &ei->vfs_inode; + } +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno); + EXPORT_SYMBOL(jbd2_journal_ack_err); + EXPORT_SYMBOL(jbd2_journal_clear_err); + EXPORT_SYMBOL(jbd2_log_wait_commit); ++EXPORT_SYMBOL(jbd2_log_start_commit); + EXPORT_SYMBOL(jbd2_journal_start_commit); + EXPORT_SYMBOL(jbd2_journal_force_commit_nested); + EXPORT_SYMBOL(jbd2_journal_wipe); diff --git a/queue-2.6.31/0085-ext4-Fix-insufficient-checks-in-EXT4_IOC_MOVE_EXT.patch b/queue-2.6.31/0085-ext4-Fix-insufficient-checks-in-EXT4_IOC_MOVE_EXT.patch new file mode 100644 index 00000000000..6b0e7483bd4 --- /dev/null +++ b/queue-2.6.31/0085-ext4-Fix-insufficient-checks-in-EXT4_IOC_MOVE_EXT.patch @@ -0,0 +1,98 @@ +From fb61e3f94098765f325a882dbb183fb01f549863 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sun, 
6 Dec 2009 23:38:31 -0500 +Subject: [PATCH 85/85] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT + +(cherry picked from commit 4a58579b9e4e2a35d57e6c9c8483e52f6f1b7fd6) + +This patch fixes three problems in the handling of the +EXT4_IOC_MOVE_EXT ioctl: + +1. In current EXT4_IOC_MOVE_EXT, there are read access mode checks for +original and donor files, but they allow illegal write access to the +donor file, since the donor file is overwritten by original file data. To +fix this problem, change access mode checks of original (r->r/w) and +donor (r->w) files. + +2. Disallow the use of donor files that have the setuid or setgid bit set. + +3. Call mnt_want_write() before and mnt_drop_write() after the call to +ext4_move_extents() to get write access to the mount. + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ioctl.c | 30 ++++++++++++++++++------------ + fs/ext4/move_extent.c | 7 +++++++ + 2 files changed, 25 insertions(+), 12 deletions(-) + +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -221,32 +221,38 @@ setversion_out: + struct file *donor_filp; + int err; + ++ if (!(filp->f_mode & FMODE_READ) || ++ !(filp->f_mode & FMODE_WRITE)) ++ return -EBADF; ++ + if (copy_from_user(&me, + (struct move_extent __user *)arg, sizeof(me))) + return -EFAULT; ++ me.moved_len = 0; + + donor_filp = fget(me.donor_fd); + if (!donor_filp) + return -EBADF; + +- if (!capable(CAP_DAC_OVERRIDE)) { +- if ((current->real_cred->fsuid != inode->i_uid) || +- !(inode->i_mode & S_IRUSR) || +- !(donor_filp->f_dentry->d_inode->i_mode & +- S_IRUSR)) { +- fput(donor_filp); +- return -EACCES; +- } ++ if (!(donor_filp->f_mode & FMODE_WRITE)) { ++ err = -EBADF; ++ goto mext_out; + } + +- me.moved_len = 0; ++ err = mnt_want_write(filp->f_path.mnt); ++ if (err) ++ goto mext_out; ++ + err = ext4_move_extents(filp, donor_filp, me.orig_start, + me.donor_start, me.len, &me.moved_len); +- fput(donor_filp); ++ mnt_drop_write(filp->f_path.mnt); ++ if
(me.moved_len > 0) ++ file_remove_suid(donor_filp); + + if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) +- return -EFAULT; +- ++ err = -EFAULT; ++mext_out: ++ fput(donor_filp); + return err; + } + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -957,6 +957,13 @@ mext_check_arguments(struct inode *orig_ + return -EINVAL; + } + ++ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ++ ext4_debug("ext4 move extent: suid or sgid is set" ++ " to donor file [ino:orig %lu, donor %lu]\n", ++ orig_inode->i_ino, donor_inode->i_ino); ++ return -EINVAL; ++ } ++ + /* Ext4 move extent does not support swapfile */ + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { + ext4_debug("ext4 move extent: The argument files should " diff --git a/queue-2.6.31/series b/queue-2.6.31/series new file mode 100644 index 00000000000..0ac4db7f91e --- /dev/null +++ b/queue-2.6.31/series @@ -0,0 +1,85 @@ +0001-ext4-Fix-memory-leak-fix-when-mounting-an-ext4-files.patch +0002-ext4-Avoid-null-pointer-dereference-when-decoding-ER.patch +0003-jbd2-Fail-to-load-a-journal-if-it-is-too-short.patch +0004-jbd2-round-commit-timer-up-to-avoid-uncommitted-tran.patch +0005-ext4-fix-journal-ref-count-in-move_extent_par_page.patch +0006-ext4-Fix-bugs-in-mballoc-s-stream-allocation-mode.patch +0007-ext4-Avoid-group-preallocation-for-closed-files.patch +0008-jbd2-Annotate-transaction-start-also-for-jbd2_journa.patch +0009-ext4-Fix-possible-deadlock-between-ext4_truncate-and.patch +0010-ext4-reject-too-large-filesystems-on-32-bit-kernels.patch +0011-ext4-Add-feature-set-check-helper-for-mount-remount-.patch +0012-ext4-Add-missing-unlock_new_inode-call-in-extent-mig.patch +0013-ext4-Allow-rename-to-create-more-than-EXT4_LINK_MAX-.patch +0014-ext4-Limit-number-of-links-that-can-be-created-by-ex.patch +0015-ext4-Restore-wbc-range_start-in-ext4_da_writepages.patch +0016-ext4-fix-cache-flush-in-ext4_sync_file.patch +0017-ext4-Fix-wrong-comparisons-in-mext_check_arguments.patch 
+0018-ext4-Remove-unneeded-BUG_ON-in-ext4_move_extents.patch +0019-ext4-Return-exchanged-blocks-count-to-user-space-in-.patch +0020-ext4-Take-page-lock-before-looking-at-attached-buffe.patch +0021-ext4-print-more-sysadmin-friendly-message-in-check_b.patch +0022-ext4-Use-bforget-in-no-journal-mode-for-ext4_journal.patch +0023-ext4-Assure-that-metadata-blocks-are-written-during-.patch +0024-ext4-Make-non-journal-fsync-work-properly.patch +0025-ext4-move-ext4_mb_init_group-function-earlier-in-the.patch +0026-ext4-check-for-need-init-flag-in-ext4_mb_load_buddy.patch +0027-ext4-Don-t-update-superblock-write-time-when-filesys.patch +0028-ext4-Always-set-dx_node-s-fake_dirent-explicitly.patch +0029-ext4-Fix-initalization-of-s_flex_groups.patch +0030-ext4-Fix-include-trace-events-ext4.h-to-work-with-Sy.patch +0031-ext4-Fix-small-typo-for-move_extent_per_page.patch +0032-ext4-Replace-get_ext_path-macro-with-an-inline-funci.patch +0033-ext4-Replace-BUG_ON-with-ext4_error-in-move_extents..patch +0034-ext4-Add-null-extent-check-to-ext_get_path.patch +0035-ext4-Fix-different-block-exchange-issue-in-EXT4_IOC_.patch +0036-ext4-limit-block-allocations-for-indirect-block-file.patch +0037-ext4-store-EXT4_EXT_MIGRATE-in-i_state-instead-of-i_.patch +0038-ext4-Fix-the-alloc-on-close-after-a-truncate-huerist.patch +0039-ext4-Fix-hueristic-which-avoids-group-preallocation-.patch +0040-ext4-Adjust-ext4_da_writepages-to-write-out-larger-c.patch +0041-ext4-release-reserved-quota-when-block-reservation-f.patch +0042-ext4-Split-uninitialized-extents-for-direct-I-O.patch +0043-ext4-Use-end_io-callback-to-avoid-direct-I-O-fallbac.patch +0044-ext4-async-direct-IO-for-holes-and-fallocate-support.patch +0045-ext4-EXT4_IOC_MOVE_EXT-Check-for-different-original-.patch +0046-ext4-Avoid-updating-the-inode-table-bh-twice-in-no-j.patch +0047-ext4-Make-sure-ext4_dirty_inode-updates-the-inode-in.patch +0048-ext4-Handle-nested-ext4_journal_start-stop-calls-wit.patch 
+0049-ext4-Fix-time-encoding-with-extra-epoch-bits.patch +0050-ext4-fix-a-BUG_ON-crash-by-checking-that-page-has-bu.patch +0051-ext4-retry-failed-direct-IO-allocations.patch +0052-ext4-discard-preallocation-when-restarting-a-transac.patch +0053-ext4-fix-ext4_ext_direct_IO-s-return-value-after-con.patch +0054-ext4-skip-conversion-of-uninit-extents-after-direct-.patch +0055-ext4-code-clean-up-for-dio-fallocate-handling.patch +0056-ext4-Fix-return-value-of-ext4_split_unwritten_extent.patch +0057-ext4-fix-potential-buffer-head-leak-when-add_dirent_.patch +0058-ext4-avoid-divide-by-zero-when-trying-to-mount-a-cor.patch +0059-ext4-fix-the-returned-block-count-if-EXT4_IOC_MOVE_E.patch +0060-ext4-fix-lock-order-problem-in-ext4_move_extents.patch +0061-ext4-fix-possible-recursive-locking-warning-in-EXT4_.patch +0062-ext4-plug-a-buffer_head-leak-in-an-error-path-of-ext.patch +0063-ext4-make-sure-directory-and-symlink-blocks-are-revo.patch +0064-ext4-fix-i_flags-access-in-ext4_da_writepages_trans_.patch +0065-ext4-journal-all-modifications-in-ext4_xattr_set_han.patch +0066-ext4-don-t-update-the-superblock-in-ext4_statfs.patch +0067-ext4-fix-uninit-block-bitmap-initialization-when-s_m.patch +0068-ext4-fix-block-validity-checks-so-they-work-correctl.patch +0069-ext4-avoid-issuing-unnecessary-barriers.patch +0070-ext4-fix-error-handling-in-ext4_ind_get_blocks.patch +0071-ext4-make-trim-discard-optional-and-off-by-default.patch +0072-ext4-make-norecovery-an-alias-for-noload.patch +0073-ext4-Fix-double-free-of-blocks-with-EXT4_IOC_MOVE_EX.patch +0074-ext4-initialize-moved_len-before-calling-ext4_move_e.patch +0075-ext4-move_extent_per_page-cleanup.patch +0076-jbd2-Add-ENOMEM-checking-in-and-for-jbd2_journal_wri.patch +0077-ext4-Return-the-PTR_ERR-of-the-correct-pointer-in-se.patch +0078-ext4-Avoid-data-filesystem-corruption-when-write-fai.patch +0079-ext4-wait-for-log-to-commit-when-umounting.patch +0080-ext4-remove-blocks-from-inode-prealloc-list-on-failu.patch 
+0081-ext4-ext4_get_reserved_space-must-return-bytes-inste.patch +0082-ext4-quota-macros-cleanup.patch +0083-ext4-fix-incorrect-block-reservation-on-quota-transf.patch +0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch +0085-ext4-Fix-insufficient-checks-in-EXT4_IOC_MOVE_EXT.patch