--- /dev/null
+From linux@linux.site Thu Dec 10 20:27:25 2009
+Message-Id: <20091211042724.642198428@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:39 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [01/90] ext4: Fix memory leak fix when mounting an ext4 filesystem
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0001-ext4-Fix-memory-leak-fix-when-mounting-an-ext4-files.patch
+Content-Length: 2371
+Lines: 69
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 024eab4d5bf7e3168a2b71038b3e04e6b1f376ed)
+
+The allocation of the ext4_group_info array was moved to a new
+function ext4_mb_add_group_info() in commit 5f21b0e6 so that online
+resize would use a common (and correct) codepath. Unfortunately, the
+call to the new ext4_mb_add_group_info() function was added without
+removing the code which originally allocated the array. This caused a
+memory leak each time an ext4 filesystem was mounted.
+
+The fix is simple; remove the code that did the original allocation,
+since it is no longer needed.
+
+Reported-by: Catalin Marinas <catalin.marinas@arm.com>
+Tested-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 19 -------------------
+ 1 file changed, 19 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2571,13 +2571,11 @@ static int ext4_mb_init_backend(struct s
+ {
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ ext4_group_t i;
+- int metalen;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
+- struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+@@ -2622,22 +2620,6 @@ static int ext4_mb_init_backend(struct s
+ goto err_freesgi;
+ }
+ EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
+-
+- metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
+- for (i = 0; i < num_meta_group_infos; i++) {
+- if ((i + 1) == num_meta_group_infos)
+- metalen = sizeof(*meta_group_info) *
+- (ngroups -
+- (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
+- meta_group_info = kmalloc(metalen, GFP_KERNEL);
+- if (meta_group_info == NULL) {
+- printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+- "buddy group\n");
+- goto err_freemeta;
+- }
+- sbi->s_group_info[i] = meta_group_info;
+- }
+-
+ for (i = 0; i < ngroups; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc == NULL) {
+@@ -2655,7 +2637,6 @@ err_freebuddy:
+ while (i-- > 0)
+ kfree(ext4_get_group_info(sb, i));
+ i = num_meta_group_infos;
+-err_freemeta:
+ while (i-- > 0)
+ kfree(sbi->s_group_info[i]);
+ iput(sbi->s_buddy_cache);
+
+
+From linux@linux.site Thu Dec 10 20:27:25 2009
+Message-Id: <20091211042725.199040442@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:40 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sesterhenn <eric.sesterhenn@lsexperts.de>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [02/90] ext4: Avoid null pointer dereference when decoding EROFS w/o a journal
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0002-ext4-Avoid-null-pointer-dereference-when-decoding-ER.patch
+Content-Length: 817
+Lines: 25
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 78f1ddbb498283c2445c11b0dfa666424c301803)
+
+We need to check to make sure a journal is present before checking the
+journal flags in ext4_decode_error().
+
+Signed-off-by: Eric Sesterhenn <eric.sesterhenn@lsexperts.de>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -344,7 +344,8 @@ static const char *ext4_decode_error(str
+ errstr = "Out of memory";
+ break;
+ case -EROFS:
+- if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)
++ if (!sb || (EXT4_SB(sb)->s_journal &&
++ EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
+ errstr = "Journal has aborted";
+ else
+ errstr = "Readonly filesystem";
+
+
+From linux@linux.site Thu Dec 10 20:27:26 2009
+Message-Id: <20091211042725.802559277@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:41 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Jan Kara <jack@suse.cz>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [03/90] jbd2: Fail to load a journal if it is too short
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0003-jbd2-Fail-to-load-a-journal-if-it-is-too-short.patch
+Content-Length: 861
+Lines: 28
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit f6f50e28f0cb8d7bcdfaacc83129f005dede11b1)
+
+Due to on disk corruption, it can happen that journal is too short. Fail
+to load it in such case so that we don't oops somewhere later.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/jbd2/journal.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *jour
+
+ first = be32_to_cpu(sb->s_first);
+ last = be32_to_cpu(sb->s_maxlen);
++ if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
++ printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
++ first, last);
++ journal_fail_superblock(journal);
++ return -EINVAL;
++ }
+
+ journal->j_first = first;
+ journal->j_last = last;
+
+
+From linux@linux.site Thu Dec 10 20:27:26 2009
+Message-Id: <20091211042726.354254054@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:42 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Alex Zhuravlev (Tomas)" <alex.zhuravlev@sun.com>,
+ Andreas Dilger <adilger@sun.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [04/90] jbd2: round commit timer up to avoid uncommitted transaction
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0004-jbd2-round-commit-timer-up-to-avoid-uncommitted-tran.patch
+Content-Length: 1099
+Lines: 27
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit b1f485f20eb9b02cc7d2009556287f3939d480cc)
+
+fix jiffie rounding in jbd commit timer setup code. Rounding down
+could cause the timer to be fired before the corresponding transaction
+has expired. That transaction can stay not committed forever if no
+new transaction is created or explicit sync/umount happens.
+
+Signed-off-by: Alex Zhuravlev (Tomas) <alex.zhuravlev@sun.com>
+Signed-off-by: Andreas Dilger <adilger@sun.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/jbd2/transaction.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/jbd2/transaction.c
++++ b/fs/jbd2/transaction.c
+@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal,
+ INIT_LIST_HEAD(&transaction->t_private_list);
+
+ /* Set up the commit timer for the new transaction. */
+- journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
++ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
+ add_timer(&journal->j_commit_timer);
+
+ J_ASSERT(journal->j_running_transaction == NULL);
+
+
+From linux@linux.site Thu Dec 10 20:27:27 2009
+Message-Id: <20091211042726.960452135@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:43 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Peng Tao <bergwolf@gmail.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [05/90] ext4: fix journal ref count in move_extent_par_page
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0005-ext4-fix-journal-ref-count-in-move_extent_par_page.patch
+Content-Length: 920
+Lines: 27
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 91cc219ad963731191247c5f2db4118be2bc341a)
+
+move_extent_par_page calls a_ops->write_begin() to increase journal
+handler's reference count. However, if either mext_replace_branches()
+or ext4_get_block fails, the increased reference count isn't
+decreased. This will cause a later attempt to umount of the fs to hang
+forever. The patch addresses the issue by calling ext4_journal_stop()
+if page is not NULL (which means a_ops->write_end() isn't invoked).
+
+Signed-off-by: Peng Tao <bergwolf@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -871,6 +871,7 @@ out:
+ if (PageLocked(page))
+ unlock_page(page);
+ page_cache_release(page);
++ ext4_journal_stop(handle);
+ }
+ out2:
+ ext4_journal_stop(handle);
+
+
+From linux@linux.site Thu Dec 10 20:27:28 2009
+Message-Id: <20091211042727.486958503@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:44 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [06/90] ext4: Fix bugs in mballocs stream allocation mode
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0006-ext4-Fix-bugs-in-mballoc-s-stream-allocation-mode.patch
+Content-Length: 3374
+Lines: 99
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 4ba74d00a20256e22f159cb288ff34b587608917)
+
+The logic around sbi->s_mb_last_group and sbi->s_mb_last_start was all
+screwed up. These fields were getting set unconditionally all the time,
+even when stream allocation had not taken place, and they were being
+used even when the file was smaller than s_mb_stream_request, which
+is when the allocation should _not_ be doing stream allocation.
+
+Fix this by determining whether or not stream allocation should
+take place once, in ext4_mb_group_or_file(), and setting a flag which
+gets used in ext4_mb_regular_allocator() and ext4_mb_use_best_found().
+This simplifies the code and assures that we are consistently using
+(or not using) the stream allocation logic.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 2 ++
+ fs/ext4/mballoc.c | 23 ++++++++++-------------
+ 2 files changed, 12 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -88,6 +88,8 @@ typedef unsigned int ext4_group_t;
+ #define EXT4_MB_HINT_TRY_GOAL 512
+ /* blocks already pre-reserved by delayed allocation */
+ #define EXT4_MB_DELALLOC_RESERVED 1024
++/* We are doing stream allocation */
++#define EXT4_MB_STREAM_ALLOC 2048
+
+
+ struct ext4_allocation_request {
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1360,7 +1360,7 @@ static void ext4_mb_use_best_found(struc
+ ac->alloc_semp = e4b->alloc_semp;
+ e4b->alloc_semp = NULL;
+ /* store last allocated for subsequent stream allocation */
+- if ((ac->ac_flags & EXT4_MB_HINT_DATA)) {
++ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+ spin_lock(&sbi->s_md_lock);
+ sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
+ sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
+@@ -1938,7 +1938,6 @@ ext4_mb_regular_allocator(struct ext4_al
+ struct ext4_sb_info *sbi;
+ struct super_block *sb;
+ struct ext4_buddy e4b;
+- loff_t size, isize;
+
+ sb = ac->ac_sb;
+ sbi = EXT4_SB(sb);
+@@ -1974,20 +1973,16 @@ ext4_mb_regular_allocator(struct ext4_al
+ }
+
+ bsbits = ac->ac_sb->s_blocksize_bits;
+- /* if stream allocation is enabled, use global goal */
+- size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+- isize = i_size_read(ac->ac_inode) >> bsbits;
+- if (size < isize)
+- size = isize;
+
+- if (size < sbi->s_mb_stream_request &&
+- (ac->ac_flags & EXT4_MB_HINT_DATA)) {
++ /* if stream allocation is enabled, use global goal */
++ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
+ /* TBD: may be hot point */
+ spin_lock(&sbi->s_md_lock);
+ ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
+ ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
+ spin_unlock(&sbi->s_md_lock);
+ }
++
+ /* Let's just scan groups to find more-less suitable blocks */
+ cr = ac->ac_2order ? 0 : 1;
+ /*
+@@ -4155,16 +4150,18 @@ static void ext4_mb_group_or_file(struct
+ if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
+ return;
+
++ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
++ return;
++
+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ isize = i_size_read(ac->ac_inode) >> bsbits;
+ size = max(size, isize);
+
+ /* don't use group allocation for large files */
+- if (size >= sbi->s_mb_stream_request)
+- return;
+-
+- if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
++ if (size >= sbi->s_mb_stream_request) {
++ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
++ }
+
+ BUG_ON(ac->ac_lg != NULL);
+ /*
+
+
+From linux@linux.site Thu Dec 10 20:27:28 2009
+Message-Id: <20091211042728.124212000@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:45 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [07/90] ext4: Avoid group preallocation for closed files
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0007-ext4-Avoid-group-preallocation-for-closed-files.patch
+Content-Length: 3603
+Lines: 103
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 50797481a7bdee548589506d7d7b48b08bc14dcd)
+
+Currently the group preallocation code tries to find a large (512)
+free block from which to do per-cpu group allocation for small files.
+The problem with this scheme is that it leaves the filesystem horribly
+fragmented. In the worst case, if the filesystem is unmounted and
+remounted (after a system shutdown, for example) we forget the fact
+that we were using a particular (now-partially filled) 512 block
+extent. So the next time we try to allocate space for a small file,
+we will find *another* completely free 512 block chunk to allocate
+small files. Given that there are 32,768 blocks in a block group,
+after 64 iterations of "mount, write one 4k file in a directory,
+unmount", the block group will have 64 files, each separated by 511
+blocks, and the block group will no longer have any completely free
+512-block chunks for group preallocation space.
+
+So if we try to allocate blocks for a file that has been closed, such
+that we know the final size of the file, and the filesystem is not
+busy, avoid using group preallocation.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 30 +++++++++++++++++++++++++++++-
+ fs/ext4/mballoc.c | 10 +++++++++-
+ 2 files changed, 38 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -952,6 +952,7 @@ struct ext4_sb_info {
+ atomic_t s_mb_lost_chunks;
+ atomic_t s_mb_preallocated;
+ atomic_t s_mb_discarded;
++ atomic_t s_lock_busy;
+
+ /* locality groups */
+ struct ext4_locality_group *s_locality_groups;
+@@ -1593,15 +1594,42 @@ struct ext4_group_info {
+ #define EXT4_MB_GRP_NEED_INIT(grp) \
+ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
+
++#define EXT4_MAX_CONTENTION 8
++#define EXT4_CONTENTION_THRESHOLD 2
++
+ static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+ ext4_group_t group)
+ {
+ return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
+ }
+
++/*
++ * Returns true if the filesystem is busy enough that attempts to
++ * access the block group locks has run into contention.
++ */
++static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
++{
++ return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
++}
++
+ static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+ {
+- spin_lock(ext4_group_lock_ptr(sb, group));
++ spinlock_t *lock = ext4_group_lock_ptr(sb, group);
++ if (spin_trylock(lock))
++ /*
++ * We're able to grab the lock right away, so drop the
++ * lock contention counter.
++ */
++ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
++ else {
++ /*
++ * The lock is busy, so bump the contention counter,
++ * and then wait on the spin lock.
++ */
++ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
++ EXT4_MAX_CONTENTION);
++ spin_lock(lock);
++ }
+ }
+
+ static inline void ext4_unlock_group(struct super_block *sb,
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4154,9 +4154,17 @@ static void ext4_mb_group_or_file(struct
+ return;
+
+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+- isize = i_size_read(ac->ac_inode) >> bsbits;
++ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
++ >> bsbits;
+ size = max(size, isize);
+
++ if ((size == isize) &&
++ !ext4_fs_is_busy(sbi) &&
++ (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
++ ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
++ return;
++ }
++
+ /* don't use group allocation for large files */
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+
+
+From linux@linux.site Thu Dec 10 20:27:29 2009
+Message-Id: <20091211042728.672704124@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:46 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Jan Kara <jack@suse.cz>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [08/90] jbd2: Annotate transaction start also for jbd2_journal_restart()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0008-jbd2-Annotate-transaction-start-also-for-jbd2_journa.patch
+Content-Length: 1349
+Lines: 43
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 9599b0e597d810be9b8f759ea6e9619c4f983c5e)
+
+lockdep annotation for a transaction start has been at the end of
+jbd2_journal_start(). But a transaction is also started from
+jbd2_journal_restart(). Move the lockdep annotation to start_this_handle()
+which covers both cases.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/jbd2/transaction.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/fs/jbd2/transaction.c
++++ b/fs/jbd2/transaction.c
+@@ -238,6 +238,8 @@ repeat_locked:
+ __jbd2_log_space_left(journal));
+ spin_unlock(&transaction->t_handle_lock);
+ spin_unlock(&journal->j_state_lock);
++
++ lock_map_acquire(&handle->h_lockdep_map);
+ out:
+ if (unlikely(new_transaction)) /* It's usually NULL */
+ kfree(new_transaction);
+@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *
+ handle = ERR_PTR(err);
+ goto out;
+ }
+-
+- lock_map_acquire(&handle->h_lockdep_map);
+ out:
+ return handle;
+ }
+@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handl
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ spin_unlock(&journal->j_state_lock);
+
++ lock_map_release(&handle->h_lockdep_map);
+ handle->h_buffer_credits = nblocks;
+ ret = start_this_handle(journal, handle);
+ return ret;
+
+
+From linux@linux.site Thu Dec 10 20:27:29 2009
+Message-Id: <20091211042729.262525249@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:47 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Jan Kara <jack@suse.cz>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [09/90] ext4: Fix possible deadlock between ext4_truncate() and ext4_get_blocks()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0009-ext4-Fix-possible-deadlock-between-ext4_truncate-and.patch
+Content-Length: 4792
+Lines: 132
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+During truncate we are sometimes forced to start a new transaction as
+the amount of blocks to be journaled is both quite large and hard to
+predict. So far we restarted a transaction while holding i_data_sem
+and that violates lock ordering because i_data_sem ranks below a
+transaction start (and it can lead to a real deadlock with
+ext4_get_blocks() mapping blocks in some page while having a
+transaction open).
+
+(cherry picked from commit 487caeef9fc08c0565e082c40a8aaf58dad92bbb)
+
+We fix the problem by dropping the i_data_sem before restarting the
+transaction and acquire it afterwards. It's slightly subtle that this
+works:
+
+1) By the time ext4_truncate() is called, all the page cache for the
+truncated part of the file is dropped so get_block() should not be
+called on it (we only have to invalidate extent cache after we
+reacquire i_data_sem because some extent from not-truncated part could
+extend also into the part we are going to truncate).
+
+2) Writes, migrate or defrag hold i_mutex so they are stopped for all
+the time of the truncate.
+
+This bug has been found and analyzed by Theodore Tso <tytso@mit.edu>.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 1 +
+ fs/ext4/extents.c | 15 ++++++++++++---
+ fs/ext4/inode.c | 23 +++++++++++++++++++----
+ 3 files changed, 32 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1370,6 +1370,7 @@ extern int ext4_change_inode_journal_fla
+ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+ extern int ext4_can_truncate(struct inode *inode);
+ extern void ext4_truncate(struct inode *);
++extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
+ extern void ext4_set_inode_flags(struct inode *);
+ extern void ext4_get_inode_flags(struct ext4_inode_info *);
+ extern int ext4_alloc_da_blocks(struct inode *inode);
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct
+ ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
+ }
+
+-static int ext4_ext_journal_restart(handle_t *handle, int needed)
++static int ext4_ext_truncate_extend_restart(handle_t *handle,
++ struct inode *inode,
++ int needed)
+ {
+ int err;
+
+@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(hand
+ err = ext4_journal_extend(handle, needed);
+ if (err <= 0)
+ return err;
+- return ext4_journal_restart(handle, needed);
++ err = ext4_truncate_restart_trans(handle, inode, needed);
++ /*
++ * We have dropped i_data_sem so someone might have cached again
++ * an extent we are going to truncate.
++ */
++ ext4_ext_invalidate_cache(inode);
++
++ return err;
+ }
+
+ /*
+@@ -2138,7 +2147,7 @@ ext4_ext_rm_leaf(handle_t *handle, struc
+ }
+ credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
+
+- err = ext4_ext_journal_restart(handle, credits);
++ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ if (err)
+ goto out;
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -192,11 +192,24 @@ static int try_to_extend_transaction(han
+ * so before we call here everything must be consistently dirtied against
+ * this transaction.
+ */
+-static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
++ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
++ int nblocks)
+ {
++ int ret;
++
++ /*
++ * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
++ * moment, get_block can be called only for blocks inside i_size since
++ * page cache has been already dropped and writes are blocked by
++ * i_mutex. So we can safely drop the i_data_sem here.
++ */
+ BUG_ON(EXT4_JOURNAL(inode) == NULL);
+ jbd_debug(2, "restarting handle %p\n", handle);
+- return ext4_journal_restart(handle, blocks_for_truncate(inode));
++ up_write(&EXT4_I(inode)->i_data_sem);
++ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
++ down_write(&EXT4_I(inode)->i_data_sem);
++
++ return ret;
+ }
+
+ /*
+@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *
+ ext4_handle_dirty_metadata(handle, inode, bh);
+ }
+ ext4_mark_inode_dirty(handle, inode);
+- ext4_journal_test_restart(handle, inode);
++ ext4_truncate_restart_trans(handle, inode,
++ blocks_for_truncate(inode));
+ if (bh) {
+ BUFFER_TRACE(bh, "retaking write access");
+ ext4_journal_get_write_access(handle, bh);
+@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t
+ return;
+ if (try_to_extend_transaction(handle, inode)) {
+ ext4_mark_inode_dirty(handle, inode);
+- ext4_journal_test_restart(handle, inode);
++ ext4_truncate_restart_trans(handle, inode,
++ blocks_for_truncate(inode));
+ }
+
+ ext4_free_blocks(handle, inode, nr, 1, 1);
+
+
+From linux@linux.site Thu Dec 10 20:27:30 2009
+Message-Id: <20091211042730.000017969@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:48 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [10/90] ext4: reject too-large filesystems on 32-bit kernels
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0010-ext4-reject-too-large-filesystems-on-32-bit-kernels.patch
+Content-Length: 1508
+Lines: 45
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit bf43d84b185e2ff54598f8c58a5a8e63148b6e90)
+
+ext4 will happily mount a > 16T filesystem on a 32-bit box, but
+this is not safe; writes to the block device will wrap past 16T
+and the page cache can't index past 16T (2^32 index * 4k pages).
+
+Adding another test to the existing "too many sectors" test
+should do the trick.
+
+Add a comment, a relevant return value, and fix the reference
+to the CONFIG_LBD(AF) option as well.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2550,12 +2550,19 @@ static int ext4_fill_super(struct super_
+ goto failed_mount;
+ }
+
+- if (ext4_blocks_count(es) >
+- (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
++ /*
++ * Test whether we have more sectors than will fit in sector_t,
++ * and whether the max offset is addressable by the page cache.
++ */
++ if ((ext4_blocks_count(es) >
++ (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
++ (ext4_blocks_count(es) >
++ (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
+ ext4_msg(sb, KERN_ERR, "filesystem"
+- " too large to mount safely");
++ " too large to mount safely on this system");
+ if (sizeof(sector_t) < 8)
+ ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
++ ret = -EFBIG;
+ goto failed_mount;
+ }
+
+
+
+From linux@linux.site Thu Dec 10 20:27:31 2009
+Message-Id: <20091211042730.719283784@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:49 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>
+Subject: [11/90] ext4: Add feature set check helper for mount & remount paths
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0011-ext4-Add-feature-set-check-helper-for-mount-remount-.patch
+Content-Length: 5384
+Lines: 157
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit a13fb1a4533f26c1e2b0204d5283b696689645af)
+
+A user reported that although his root ext4 filesystem was mounting
+fine, other filesystems would not mount, with the:
+
+"Filesystem with huge files cannot be mounted RDWR without CONFIG_LBDAF"
+
+error on his 32-bit box built without CONFIG_LBDAF. This is because
+the test at mount time for this situation was not being re-checked
+on remount, and the normal boot process makes an ro->rw transition,
+so this was being missed.
+
+Refactor to make a common helper function to test the filesystem
+features against the type of mount request (RO vs. RW) so that we
+stay consistent.
+
+Addresses Red-Hat-Bugzilla: #517650
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+---
+ fs/ext4/super.c | 91 ++++++++++++++++++++++++++++++--------------------------
+ 1 file changed, 49 insertions(+), 42 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2254,6 +2254,49 @@ static struct kobj_type ext4_ktype = {
+ .release = ext4_sb_release,
+ };
+
++/*
++ * Check whether this filesystem can be mounted based on
++ * the features present and the RDONLY/RDWR mount requested.
++ * Returns 1 if this filesystem can be mounted as requested,
++ * 0 if it cannot be.
++ */
++static int ext4_feature_set_ok(struct super_block *sb, int readonly)
++{
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
++ ext4_msg(sb, KERN_ERR,
++ "Couldn't mount because of "
++ "unsupported optional features (%x)",
++ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
++ ~EXT4_FEATURE_INCOMPAT_SUPP));
++ return 0;
++ }
++
++ if (readonly)
++ return 1;
++
++ /* Check that feature set is OK for a read-write mount */
++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
++ ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
++ "unsupported optional features (%x)",
++ (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
++ ~EXT4_FEATURE_RO_COMPAT_SUPP));
++ return 0;
++ }
++ /*
++ * Large file size enabled file system can only be mounted
++ * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
++ */
++ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
++ if (sizeof(blkcnt_t) < sizeof(u64)) {
++ ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
++ "cannot be mounted RDWR without "
++ "CONFIG_LBDAF");
++ return 0;
++ }
++ }
++ return 1;
++}
++
+ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
+@@ -2275,7 +2318,6 @@ static int ext4_fill_super(struct super_
+ unsigned int db_count;
+ unsigned int i;
+ int needs_recovery, has_huge_files;
+- int features;
+ __u64 blocks_count;
+ int err;
+ unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+@@ -2402,39 +2444,9 @@ static int ext4_fill_super(struct super_
+ * previously didn't change the revision level when setting the flags,
+ * so there is a chance incompat flags are set on a rev 0 filesystem.
+ */
+- features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
+- if (features) {
+- ext4_msg(sb, KERN_ERR,
+- "Couldn't mount because of "
+- "unsupported optional features (%x)",
+- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
+- ~EXT4_FEATURE_INCOMPAT_SUPP));
++ if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
+ goto failed_mount;
+- }
+- features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
+- if (!(sb->s_flags & MS_RDONLY) && features) {
+- ext4_msg(sb, KERN_ERR,
+- "Couldn't mount RDWR because of "
+- "unsupported optional features (%x)",
+- (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
+- ~EXT4_FEATURE_RO_COMPAT_SUPP));
+- goto failed_mount;
+- }
+- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+- if (has_huge_files) {
+- /*
+- * Large file size enabled file system can only be
+- * mount if kernel is build with CONFIG_LBDAF
+- */
+- if (sizeof(root->i_blocks) < sizeof(u64) &&
+- !(sb->s_flags & MS_RDONLY)) {
+- ext4_msg(sb, KERN_ERR, "Filesystem with huge "
+- "files cannot be mounted read-write "
+- "without CONFIG_LBDAF");
+- goto failed_mount;
+- }
+- }
++
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+@@ -2470,6 +2482,8 @@ static int ext4_fill_super(struct super_
+ }
+ }
+
++ has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
++ EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
+ has_huge_files);
+ sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
+@@ -3485,18 +3499,11 @@ static int ext4_remount(struct super_blo
+ if (sbi->s_journal)
+ ext4_mark_recovery_complete(sb, es);
+ } else {
+- int ret;
+- if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+- ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
+- ext4_msg(sb, KERN_WARNING, "couldn't "
+- "remount RDWR because of unsupported "
+- "optional features (%x)",
+- (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
+- ~EXT4_FEATURE_RO_COMPAT_SUPP));
++ /* Make sure we can mount this feature set readwrite */
++ if (!ext4_feature_set_ok(sb, 0)) {
+ err = -EROFS;
+ goto restore_opts;
+ }
+-
+ /*
+ * Make sure the group descriptor checksums
+ * are sane. If they aren't, refuse to remount r/w.
+
+
+From linux@linux.site Thu Dec 10 20:27:32 2009
+Message-Id: <20091211042731.668869144@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:50 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>
+Subject: [12/90] ext4: Add missing unlock_new_inode() call in extent migration code
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0012-ext4-Add-missing-unlock_new_inode-call-in-extent-mig.patch
+Content-Length: 1785
+Lines: 46
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit a8526e84ac758ac6da45cf273aa1538a6a7aa3de)
+
+We need to unlock the new inode before iput. This patch fixes the
+following warning when calling chattr +e to migrate a file to use
+extents. It also fixes problems when e4defrag attempts to
+defragment an inode.
+
+[ 470.400044] ------------[ cut here ]------------
+[ 470.400065] WARNING: at fs/inode.c:1210 generic_delete_inode+0x65/0x16a()
+[ 470.400072] Hardware name: N/A
+.....
+...
+[ 470.400353] Pid: 4451, comm: chattr Not tainted 2.6.31-rc7-red-debug #4
+[ 470.400359] Call Trace:
+[ 470.400372] [<ffffffff81037771>] warn_slowpath_common+0x77/0x8f
+[ 470.400385] [<ffffffff81037798>] warn_slowpath_null+0xf/0x11
+[ 470.400395] [<ffffffff810b7f28>] generic_delete_inode+0x65/0x16a
+[ 470.400405] [<ffffffff810b8044>] generic_drop_inode+0x17/0x1bd
+[ 470.400413] [<ffffffff810b7083>] iput+0x61/0x65
+[ 470.400455] [<ffffffffa003b229>] ext4_ext_migrate+0x5eb/0x66a [ext4]
+[ 470.400492] [<ffffffffa002b1f8>] ext4_ioctl+0x340/0x756 [ext4]
+[ 470.400507] [<ffffffff810b1a91>] vfs_ioctl+0x1d/0x82
+[ 470.400517] [<ffffffff810b1ff0>] do_vfs_ioctl+0x483/0x4c9
+[ 470.400527] [<ffffffff81059c30>] ? trace_hardirqs_on+0xd/0xf
+[ 470.400537] [<ffffffff810b2087>] sys_ioctl+0x51/0x74
+[ 470.400549] [<ffffffff8100ba6b>] system_call_fastpath+0x16/0x1b
+[ 470.400557] ---[ end trace ab85723542352dac ]---
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+---
+ fs/ext4/migrate.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -618,7 +618,7 @@ err_out:
+ tmp_inode->i_nlink = 0;
+
+ ext4_journal_stop(handle);
+-
++ unlock_new_inode(tmp_inode);
+ iput(tmp_inode);
+
+ return retval;
+
+
+From linux@linux.site Thu Dec 10 20:27:33 2009
+Message-Id: <20091211042732.645012343@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:51 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>
+Subject: [13/90] ext4: Allow rename to create more than EXT4_LINK_MAX subdirectories
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0013-ext4-Allow-rename-to-create-more-than-EXT4_LINK_MAX-.patch
+Content-Length: 705
+Lines: 23
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 2c94eb86c66e1eaaa1e7d8a2120f4fad5e7e7736)
+
+Use EXT4_DIR_LINK_MAX so that rename() can move a directory into new
+parent directory without running into the EXT4_LINK_MAX limit.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+---
+ fs/ext4/namei.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2413,7 +2413,7 @@ static int ext4_rename(struct inode *old
+ goto end_rename;
+ retval = -EMLINK;
+ if (!new_inode && new_dir != old_dir &&
+- new_dir->i_nlink >= EXT4_LINK_MAX)
++ EXT4_DIR_LINK_MAX(new_dir))
+ goto end_rename;
+ }
+ if (!new_bh) {
+
+
+From linux@linux.site Thu Dec 10 20:27:34 2009
+Message-Id: <20091211042733.492375175@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:52 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>
+Subject: [14/90] ext4: Limit number of links that can be created by ext4_link()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0014-ext4-Limit-number-of-links-that-can-be-created-by-ex.patch
+Content-Length: 633
+Lines: 23
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit b05ab1dc3795e6f997fb0d34f38fce5012533c3e)
+
+In ext4_link we need to check using EXT4_LINK_MAX, and not
+EXT4_DIR_LINK_MAX(), since ext4_link() is creating hard links of
+regular files, and not directories.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+---
+ fs/ext4/namei.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2310,7 +2310,7 @@ static int ext4_link(struct dentry *old_
+ struct inode *inode = old_dentry->d_inode;
+ int err, retries = 0;
+
+- if (EXT4_DIR_LINK_MAX(inode))
++ if (inode->i_nlink >= EXT4_LINK_MAX)
+ return -EMLINK;
+
+ /*
+
+
+From linux@linux.site Thu Dec 10 20:27:35 2009
+Message-Id: <20091211042734.391001793@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:53 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>
+Subject: [15/90] ext4: Restore wbc->range_start in ext4_da_writepages()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0015-ext4-Restore-wbc-range_start-in-ext4_da_writepages.patch
+Content-Length: 1228
+Lines: 35
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit de89de6e0cf4b1eb13f27137cf2aa40d287aabdf)
+
+To solve a lock inversion problem, we implement part of the
+range_cyclic algorithm in ext4_da_writepages(). (See commit 2acf2c26
+for more details.)
+
+As part of that change wbc->range_start was modified by ext4's
+writepages function, which causes its callers to get confused since
+they aren't expecting the filesystem to modify it. The simplest fix
+is to save and restore wbc->range_start in ext4_da_writepages.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+---
+ fs/ext4/inode.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2750,6 +2750,7 @@ static int ext4_da_writepages(struct add
+ long pages_skipped;
+ int range_cyclic, cycled = 1, io_done = 0;
+ int needed_blocks, ret = 0, nr_to_writebump = 0;
++ loff_t range_start = wbc->range_start;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+ trace_ext4_da_writepages(inode, wbc);
+@@ -2918,6 +2919,7 @@ out_writepages:
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+ wbc->nr_to_write -= nr_to_writebump;
++ wbc->range_start = range_start;
+ trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ return ret;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:36 2009
+Message-Id: <20091211042735.238833324@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:54 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Christoph Hellwig <hch@lst.de>,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>
+Subject: [16/90] ext4: fix cache flush in ext4_sync_file
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0016-ext4-fix-cache-flush-in-ext4_sync_file.patch
+Content-Length: 1069
+Lines: 31
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 5f3481e9a80c240f169b36ea886e2325b9aeb745)
+
+We need to flush the write cache unconditionally in ->fsync, otherwise
+writes into already allocated blocks can get lost. Writes into fully
+allocated files are very common when using disk images for
+virtualization, and without this fix can easily lose data after
+an fdatasync, which is the typical implementation for a cache flush on
+the virtual drive.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Acked-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+---
+ fs/ext4/fsync.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -92,9 +92,9 @@ int ext4_sync_file(struct file *file, st
+ .nr_to_write = 0, /* sys_fsync did this */
+ };
+ ret = sync_inode(inode, &wbc);
+- if (journal && (journal->j_flags & JBD2_BARRIER))
+- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ }
+ out:
++ if (journal && (journal->j_flags & JBD2_BARRIER))
++ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ return ret;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:36 2009
+Message-Id: <20091211042736.230715168@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:55 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [17/90] ext4: Fix wrong comparisons in mext_check_arguments()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0017-ext4-Fix-wrong-comparisons-in-mext_check_arguments.patch
+Content-Length: 3782
+Lines: 97
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 70d5d3dcea47c16058d2b093c29e07fdf61b56ad)
+
+The mext_check_arguments() function in move_extents.c has wrong
+comparisons. orig_start, which is passed from user space, is in block
+units, but the inode's i_size is in bytes, so the checks do not
+work correctly. This mis-check leads to an overflow of 'len', which then
+hits the BUG_ON() in ext4_move_extents(). The patch fixes this issue.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Reviewed-by: Greg Freemyer <greg.freemyer@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++-------------------
+ 1 file changed, 27 insertions(+), 19 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -898,6 +898,10 @@ mext_check_arguments(struct inode *orig_
+ struct inode *donor_inode, __u64 orig_start,
+ __u64 donor_start, __u64 *len, __u64 moved_len)
+ {
++ ext4_lblk_t orig_blocks, donor_blocks;
++ unsigned int blkbits = orig_inode->i_blkbits;
++ unsigned int blocksize = 1 << blkbits;
++
+ /* Regular file check */
+ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+ ext4_debug("ext4 move extent: The argument files should be "
+@@ -972,43 +976,47 @@ mext_check_arguments(struct inode *orig_
+ }
+
+ if (orig_inode->i_size > donor_inode->i_size) {
+- if (orig_start >= donor_inode->i_size) {
++ donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
++ /* TODO: eliminate this artificial restriction */
++ if (orig_start >= donor_blocks) {
+ ext4_debug("ext4 move extent: orig start offset "
+- "[%llu] should be less than donor file size "
+- "[%lld] [ino:orig %lu, donor_inode %lu]\n",
+- orig_start, donor_inode->i_size,
++ "[%llu] should be less than donor file blocks "
++ "[%u] [ino:orig %lu, donor %lu]\n",
++ orig_start, donor_blocks,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+- if (orig_start + *len > donor_inode->i_size) {
++ /* TODO: eliminate this artificial restriction */
++ if (orig_start + *len > donor_blocks) {
+ ext4_debug("ext4 move extent: End offset [%llu] should "
+- "be less than donor file size [%lld]."
+- "So adjust length from %llu to %lld "
++ "be less than donor file blocks [%u]."
++ "So adjust length from %llu to %llu "
+ "[ino:orig %lu, donor %lu]\n",
+- orig_start + *len, donor_inode->i_size,
+- *len, donor_inode->i_size - orig_start,
++ orig_start + *len, donor_blocks,
++ *len, donor_blocks - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+- *len = donor_inode->i_size - orig_start;
++ *len = donor_blocks - orig_start;
+ }
+ } else {
+- if (orig_start >= orig_inode->i_size) {
++ orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
++ if (orig_start >= orig_blocks) {
+ ext4_debug("ext4 move extent: start offset [%llu] "
+- "should be less than original file size "
+- "[%lld] [inode:orig %lu, donor %lu]\n",
+- orig_start, orig_inode->i_size,
++ "should be less than original file blocks "
++ "[%u] [ino:orig %lu, donor %lu]\n",
++ orig_start, orig_blocks,
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
+
+- if (orig_start + *len > orig_inode->i_size) {
++ if (orig_start + *len > orig_blocks) {
+ ext4_debug("ext4 move extent: Adjust length "
+- "from %llu to %lld. Because it should be "
+- "less than original file size "
++ "from %llu to %llu. Because it should be "
++ "less than original file blocks "
+ "[ino:orig %lu, donor %lu]\n",
+- *len, orig_inode->i_size - orig_start,
++ *len, orig_blocks - orig_start,
+ orig_inode->i_ino, donor_inode->i_ino);
+- *len = orig_inode->i_size - orig_start;
++ *len = orig_blocks - orig_start;
+ }
+ }
+
+
+
+From linux@linux.site Thu Dec 10 20:27:37 2009
+Message-Id: <20091211042737.028935695@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:56 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [18/90] ext4: Remove unneeded BUG_ON() in ext4_move_extents()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0018-ext4-Remove-unneeded-BUG_ON-in-ext4_move_extents.patch
+Content-Length: 923
+Lines: 28
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit daea696dbac0e33af3cfe304efbfb8d74e0effe6)
+
+The ext4_move_extents() function checks with BUG_ON() whether the
+exchanged blocks count accords with request blocks count. But, if the
+target range (orig_start + len) includes sparse block(s), 'moved_len'
+(exchanged blocks count) does not agree with 'len' (request blocks
+count), since sparse block is not counted in 'moved_len'. This causes
+us to hit the BUG_ON(), even though the function succeeded.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -1322,8 +1322,5 @@ out2:
+ if (ret)
+ return ret;
+
+- /* All of the specified blocks must be exchanged in succeed */
+- BUG_ON(*moved_len != len);
+-
+ return 0;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:38 2009
+Message-Id: <20091211042737.946304901@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:57 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [19/90] ext4: Return exchanged blocks count to user space in failure
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0019-ext4-Return-exchanged-blocks-count-to-user-space-in-.patch
+Content-Length: 771
+Lines: 29
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 8d6669133d8cdbb7cbe0e1f0f3744e7802a84afe)
+
+Return exchanged blocks count (moved_len) to user space,
+if ext4_move_extents() failed on the way.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -243,10 +243,9 @@ setversion_out:
+ me.donor_start, me.len, &me.moved_len);
+ fput(donor_filp);
+
+- if (!err)
+- if (copy_to_user((struct move_extent *)arg,
+- &me, sizeof(me)))
+- return -EFAULT;
++ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
++ return -EFAULT;
++
+ return err;
+ }
+
+
+
+From linux@linux.site Thu Dec 10 20:27:39 2009
+Message-Id: <20091211042738.873362487@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:58 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [20/90] ext4: Take page lock before looking at attached buffer_heads flags
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0020-ext4-Take-page-lock-before-looking-at-attached-buffe.patch
+Content-Length: 1236
+Lines: 39
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit a827eaffff07c7d58a4cb32158cbeb4849f4e33a)
+
+In order to check whether the buffer_heads are mapped we need to hold
+page lock. Otherwise a reclaim can cleanup the attached buffer_heads.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5298,12 +5298,21 @@ int ext4_page_mkwrite(struct vm_area_str
+ else
+ len = PAGE_CACHE_SIZE;
+
++ lock_page(page);
++ /*
++ * return if we have all the buffers mapped. This avoid
++ * the need to call write_begin/write_end which does a
++ * journal_start/journal_stop which can block and take
++ * long time
++ */
+ if (page_has_buffers(page)) {
+- /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+- ext4_bh_unmapped))
++ ext4_bh_unmapped)) {
++ unlock_page(page);
+ goto out_unlock;
++ }
+ }
++ unlock_page(page);
+ /*
+ * OK, we need to fill the hole... Do write_begin write_end
+ * to do block allocation/reservation.We are not holding
+
+
+From linux@linux.site Thu Dec 10 20:27:40 2009
+Message-Id: <20091211042739.750528035@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:59 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [21/90] ext4: print more sysadmin-friendly message in check_block_validity()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0021-ext4-print-more-sysadmin-friendly-message-in-check_b.patch
+Content-Length: 2085
+Lines: 60
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 80e42468d65475e92651e62175bb7807773321d0)
+
+Drop the WARN_ON(1), as the stack trace is not appropriate, since it is
+triggered by file system corruption, and it misleads users into
+thinking there is a kernel bug. In addition, change the message
+displayed by ext4_error() to make it clear that this is a file system
+corruption problem.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1122,16 +1122,15 @@ static void ext4_da_update_reserve_space
+ ext4_discard_preallocations(inode);
+ }
+
+-static int check_block_validity(struct inode *inode, sector_t logical,
+- sector_t phys, int len)
++static int check_block_validity(struct inode *inode, const char *msg,
++ sector_t logical, sector_t phys, int len)
+ {
+ if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+- ext4_error(inode->i_sb, "check_block_validity",
++ ext4_error(inode->i_sb, msg,
+ "inode #%lu logical block %llu mapped to %llu "
+ "(size %d)", inode->i_ino,
+ (unsigned long long) logical,
+ (unsigned long long) phys, len);
+- WARN_ON(1);
+ return -EIO;
+ }
+ return 0;
+@@ -1183,8 +1182,8 @@ int ext4_get_blocks(handle_t *handle, st
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ if (retval > 0 && buffer_mapped(bh)) {
+- int ret = check_block_validity(inode, block,
+- bh->b_blocknr, retval);
++ int ret = check_block_validity(inode, "file system corruption",
++ block, bh->b_blocknr, retval);
+ if (ret != 0)
+ return ret;
+ }
+@@ -1265,8 +1264,9 @@ int ext4_get_blocks(handle_t *handle, st
+
+ up_write((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && buffer_mapped(bh)) {
+- int ret = check_block_validity(inode, block,
+- bh->b_blocknr, retval);
++ int ret = check_block_validity(inode, "file system "
++ "corruption after allocation",
++ block, bh->b_blocknr, retval);
+ if (ret != 0)
+ return ret;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:41 2009
+Message-Id: <20091211042740.547415555@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:00 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [22/90] ext4: Use bforget() in no journal mode for ext4_journal_{forget,revoke}()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0022-ext4-Use-bforget-in-no-journal-mode-for-ext4_journal.patch
+Content-Length: 1230
+Lines: 41
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit c7acb4c16646943180bd221c167a077e0a084f9c)
+
+When ext4 is using a journal, a metadata block which is deallocated
+must be passed into the journal layer so it can be dropped from the
+current transaction and/or revoked. This is done by calling the
+functions ext4_journal_forget() and ext4_journal_revoke(), which call
+jbd2_journal_forget(), and jbd2_journal_revoke(), respectively.
+
+Since the jbd2_journal_forget() and jbd2_journal_revoke() call
+bforget(), if ext4 is not using a journal, ext4_journal_forget() and
+ext4_journal_revoke() must call bforget() to avoid a dirty metadata
+block overwriting a block after it has been reallocated and reused for
+another inode's data block.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4_jbd2.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.c
++++ b/fs/ext4/ext4_jbd2.c
+@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *wh
+ handle, err);
+ }
+ else
+- brelse(bh);
++ bforget(bh);
+ return err;
+ }
+
+@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *wh
+ handle, err);
+ }
+ else
+- brelse(bh);
++ bforget(bh);
+ return err;
+ }
+
+
+
+From linux@linux.site Thu Dec 10 20:27:41 2009
+Message-Id: <20091211042741.225776673@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:01 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [23/90] ext4: Assure that metadata blocks are written during fsync in no journal mode
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0023-ext4-Assure-that-metadata-blocks-are-written-during-.patch
+Content-Length: 1900
+Lines: 63
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit fe188c0e084bdf3038dc0ac963c21d764f53f7da)
+
+When there is no journal present, we must attach buffer heads
+associated with extent tree and indirect blocks to the inode's
+mapping->private_list via mark_buffer_dirty_inode() so that
+ext4_sync_file() --- which is called to service fsync() and
+fdatasync() system calls --- can write out the inode's metadata blocks
+by calling sync_mapping_buffers().
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4_jbd2.c | 5 ++++-
+ fs/ext4/fsync.c | 9 +++++++--
+ 2 files changed, 11 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.c
++++ b/fs/ext4/ext4_jbd2.c
+@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const c
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ } else {
+- mark_buffer_dirty(bh);
++ if (inode && bh)
++ mark_buffer_dirty_inode(bh, inode);
++ else
++ mark_buffer_dirty(bh);
+ if (inode && inode_needs_sync(inode)) {
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh)) {
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, st
+ {
+ struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+- int ret = 0;
++ int err, ret = 0;
+
+ J_ASSERT(ext4_journal_current_handle() == NULL);
+
+@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, st
+ goto out;
+ }
+
++ if (!journal)
++ ret = sync_mapping_buffers(inode->i_mapping);
++
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ goto out;
+
+@@ -91,7 +94,9 @@ int ext4_sync_file(struct file *file, st
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = 0, /* sys_fsync did this */
+ };
+- ret = sync_inode(inode, &wbc);
++ err = sync_inode(inode, &wbc);
++ if (ret == 0)
++ ret = err;
+ }
+ out:
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+
+
+From linux@linux.site Thu Dec 10 20:27:42 2009
+Message-Id: <20091211042741.985036844@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:02 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Frank Mayhar <fmayhar@google.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [24/90] ext4: Make non-journal fsync work properly
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0024-ext4-Make-non-journal-fsync-work-properly.patch
+Content-Length: 3529
+Lines: 113
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 91ac6f43317c0bf99969665f98016548011dfa38)
+
+Teach ext4_write_inode() and ext4_do_update_inode() about non-journal
+mode: If we're not using a journal, ext4_write_inode() now calls
+ext4_do_update_inode() (after getting the iloc via ext4_get_inode_loc())
+with a new "do_sync" parameter. If that parameter is nonzero _and_ we're
+not using a journal, ext4_do_update_inode() calls sync_dirty_buffer()
+instead of ext4_handle_dirty_metadata().
+
+This problem was found in power-fail testing, checking the amount of
+loss of files and blocks after a power failure when using fsync() and
+when not using fsync(). It turned out that using fsync() was actually
+worse than not doing so, possibly because it increased the likelihood
+that the inodes would remain unflushed and would therefore be lost at
+the power failure.
+
+Signed-off-by: Frank Mayhar <fmayhar@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 40 insertions(+), 14 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4550,7 +4550,8 @@ static int ext4_inode_blocks_set(handle_
+ */
+ static int ext4_do_update_inode(handle_t *handle,
+ struct inode *inode,
+- struct ext4_iloc *iloc)
++ struct ext4_iloc *iloc,
++ int do_sync)
+ {
+ struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+@@ -4652,10 +4653,22 @@ static int ext4_do_update_inode(handle_t
+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ }
+
+- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+- rc = ext4_handle_dirty_metadata(handle, inode, bh);
+- if (!err)
+- err = rc;
++ /*
++ * If we're not using a journal and we were called from
++ * ext4_write_inode() to sync the inode (making do_sync true),
++ * we can just use sync_dirty_buffer() directly to do our dirty
++ * work. Testing s_journal here is a bit redundant but it's
++ * worth it to avoid potential future trouble.
++ */
++ if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
++ BUFFER_TRACE(bh, "call sync_dirty_buffer");
++ sync_dirty_buffer(bh);
++ } else {
++ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
++ rc = ext4_handle_dirty_metadata(handle, inode, bh);
++ if (!err)
++ err = rc;
++ }
+ ei->i_state &= ~EXT4_STATE_NEW;
+
+ out_brelse:
+@@ -4701,19 +4714,32 @@ out_brelse:
+ */
+ int ext4_write_inode(struct inode *inode, int wait)
+ {
++ int err;
++
+ if (current->flags & PF_MEMALLOC)
+ return 0;
+
+- if (ext4_journal_current_handle()) {
+- jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
+- dump_stack();
+- return -EIO;
+- }
++ if (EXT4_SB(inode->i_sb)->s_journal) {
++ if (ext4_journal_current_handle()) {
++ jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
++ dump_stack();
++ return -EIO;
++ }
+
+- if (!wait)
+- return 0;
++ if (!wait)
++ return 0;
++
++ err = ext4_force_commit(inode->i_sb);
++ } else {
++ struct ext4_iloc iloc;
+
+- return ext4_force_commit(inode->i_sb);
++ err = ext4_get_inode_loc(inode, &iloc);
++ if (err)
++ return err;
++ err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
++ inode, &iloc, wait);
++ }
++ return err;
+ }
+
+ /*
+@@ -5007,7 +5033,7 @@ int ext4_mark_iloc_dirty(handle_t *handl
+ get_bh(iloc->bh);
+
+ /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
+- err = ext4_do_update_inode(handle, inode, iloc);
++ err = ext4_do_update_inode(handle, inode, iloc, 0);
+ put_bh(iloc->bh);
+ return err;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:44 2009
+Message-Id: <20091211042742.991422660@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:03 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [25/90] ext4: move ext4_mb_init_group() function earlier in the mballoc.c
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0025-ext4-move-ext4_mb_init_group-function-earlier-in-the.patch
+Content-Length: 5462
+Lines: 211
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit b6a758ec3af3ec236dbfdcf6a06b84ac8f94957e)
+
+This moves the function around so that it can be called from
+ext4_mb_load_buddy().
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 182 +++++++++++++++++++++++++++---------------------------
+ 1 file changed, 91 insertions(+), 91 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -908,6 +908,97 @@ out:
+ return err;
+ }
+
++static noinline_for_stack
++int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
++{
++
++ int ret = 0;
++ void *bitmap;
++ int blocks_per_page;
++ int block, pnum, poff;
++ int num_grp_locked = 0;
++ struct ext4_group_info *this_grp;
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
++ struct inode *inode = sbi->s_buddy_cache;
++ struct page *page = NULL, *bitmap_page = NULL;
++
++ mb_debug("init group %lu\n", group);
++ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
++ this_grp = ext4_get_group_info(sb, group);
++ /*
++ * This ensures we don't add group
++ * to this buddy cache via resize
++ */
++ num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
++ if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
++ /*
++ * somebody initialized the group
++ * return without doing anything
++ */
++ ret = 0;
++ goto err;
++ }
++ /*
++ * the buddy cache inode stores the block bitmap
++ * and buddy information in consecutive blocks.
++ * So for each group we need two blocks.
++ */
++ block = group * 2;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page) {
++ BUG_ON(page->mapping != inode->i_mapping);
++ ret = ext4_mb_init_cache(page, NULL);
++ if (ret) {
++ unlock_page(page);
++ goto err;
++ }
++ unlock_page(page);
++ }
++ if (page == NULL || !PageUptodate(page)) {
++ ret = -EIO;
++ goto err;
++ }
++ mark_page_accessed(page);
++ bitmap_page = page;
++ bitmap = page_address(page) + (poff * sb->s_blocksize);
++
++ /* init buddy cache */
++ block++;
++ pnum = block / blocks_per_page;
++ poff = block % blocks_per_page;
++ page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
++ if (page == bitmap_page) {
++ /*
++ * If both the bitmap and buddy are in
++ * the same page we don't need to force
++ * init the buddy
++ */
++ unlock_page(page);
++ } else if (page) {
++ BUG_ON(page->mapping != inode->i_mapping);
++ ret = ext4_mb_init_cache(page, bitmap);
++ if (ret) {
++ unlock_page(page);
++ goto err;
++ }
++ unlock_page(page);
++ }
++ if (page == NULL || !PageUptodate(page)) {
++ ret = -EIO;
++ goto err;
++ }
++ mark_page_accessed(page);
++err:
++ ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
++ if (bitmap_page)
++ page_cache_release(bitmap_page);
++ if (page)
++ page_cache_release(page);
++ return ret;
++}
++
+ static noinline_for_stack int
+ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
+@@ -1837,97 +1928,6 @@ void ext4_mb_put_buddy_cache_lock(struct
+
+ }
+
+-static noinline_for_stack
+-int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+-{
+-
+- int ret;
+- void *bitmap;
+- int blocks_per_page;
+- int block, pnum, poff;
+- int num_grp_locked = 0;
+- struct ext4_group_info *this_grp;
+- struct ext4_sb_info *sbi = EXT4_SB(sb);
+- struct inode *inode = sbi->s_buddy_cache;
+- struct page *page = NULL, *bitmap_page = NULL;
+-
+- mb_debug("init group %lu\n", group);
+- blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+- this_grp = ext4_get_group_info(sb, group);
+- /*
+- * This ensures we don't add group
+- * to this buddy cache via resize
+- */
+- num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
+- if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+- /*
+- * somebody initialized the group
+- * return without doing anything
+- */
+- ret = 0;
+- goto err;
+- }
+- /*
+- * the buddy cache inode stores the block bitmap
+- * and buddy information in consecutive blocks.
+- * So for each group we need two blocks.
+- */
+- block = group * 2;
+- pnum = block / blocks_per_page;
+- poff = block % blocks_per_page;
+- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+- if (page) {
+- BUG_ON(page->mapping != inode->i_mapping);
+- ret = ext4_mb_init_cache(page, NULL);
+- if (ret) {
+- unlock_page(page);
+- goto err;
+- }
+- unlock_page(page);
+- }
+- if (page == NULL || !PageUptodate(page)) {
+- ret = -EIO;
+- goto err;
+- }
+- mark_page_accessed(page);
+- bitmap_page = page;
+- bitmap = page_address(page) + (poff * sb->s_blocksize);
+-
+- /* init buddy cache */
+- block++;
+- pnum = block / blocks_per_page;
+- poff = block % blocks_per_page;
+- page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+- if (page == bitmap_page) {
+- /*
+- * If both the bitmap and buddy are in
+- * the same page we don't need to force
+- * init the buddy
+- */
+- unlock_page(page);
+- } else if (page) {
+- BUG_ON(page->mapping != inode->i_mapping);
+- ret = ext4_mb_init_cache(page, bitmap);
+- if (ret) {
+- unlock_page(page);
+- goto err;
+- }
+- unlock_page(page);
+- }
+- if (page == NULL || !PageUptodate(page)) {
+- ret = -EIO;
+- goto err;
+- }
+- mark_page_accessed(page);
+-err:
+- ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
+- if (bitmap_page)
+- page_cache_release(bitmap_page);
+- if (page)
+- page_cache_release(page);
+- return ret;
+-}
+-
+ static noinline_for_stack int
+ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
+ {
+
+
+From linux@linux.site Thu Dec 10 20:27:45 2009
+Message-Id: <20091211042744.228047197@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:04 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [26/90] ext4: check for need init flag in ext4_mb_load_buddy
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0026-ext4-check-for-need-init-flag-in-ext4_mb_load_buddy.patch
+Content-Length: 2180
+Lines: 75
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit f41c0750538667b87a19c93952e5d42fcc069bd7)
+
+We should check the need-init flag with the group's alloc_sem held, so
+that while we are loading the buddy cache and holding a reference to it,
+a file system resize can't add new blocks to the same group.
+
+The patch also drops the need init flag check in
+ext4_mb_regular_allocator() because doing the check without holding
+alloc_sem is racy.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 39 ++++++++++++++++++---------------------
+ 1 file changed, 18 insertions(+), 21 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1032,8 +1032,26 @@ ext4_mb_load_buddy(struct super_block *s
+ * groups mapped by the page is blocked
+ * till we are done with allocation
+ */
++repeat_load_buddy:
+ down_read(e4b->alloc_semp);
+
++ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
++ /* we need to check for group need init flag
++ * with alloc_semp held so that we can be sure
++ * that new blocks didn't get added to the group
++ * when we are loading the buddy cache
++ */
++ up_read(e4b->alloc_semp);
++ /*
++ * we need full data about the group
++ * to make a good selection
++ */
++ ret = ext4_mb_init_group(sb, group);
++ if (ret)
++ return ret;
++ goto repeat_load_buddy;
++ }
++
+ /*
+ * the buddy cache inode stores the block bitmap
+ * and buddy information in consecutive blocks.
+@@ -2010,27 +2028,6 @@ repeat:
+ if (grp->bb_free == 0)
+ continue;
+
+- /*
+- * if the group is already init we check whether it is
+- * a good group and if not we don't load the buddy
+- */
+- if (EXT4_MB_GRP_NEED_INIT(grp)) {
+- /*
+- * we need full data about the group
+- * to make a good selection
+- */
+- err = ext4_mb_init_group(sb, group);
+- if (err)
+- goto out;
+- }
+-
+- /*
+- * If the particular group doesn't satisfy our
+- * criteria we continue with the next group
+- */
+- if (!ext4_mb_good_group(ac, group, cr))
+- continue;
+-
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+ if (err)
+ goto out;
+
+
+From linux@linux.site Thu Dec 10 20:27:46 2009
+Message-Id: <20091211042745.672220552@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:05 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [27/90] ext4: Dont update superblock write time when filesystem is read-only
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0027-ext4-Don-t-update-superblock-write-time-when-filesys.patch
+Content-Length: 1535
+Lines: 37
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 71290b368ad5e1e0b0b300c9d5638490a9fd1a2d)
+
+This avoids updating the superblock write time when we are mounting
+the root file system read/only but need to replay the journal.  At
+that point, for people east of GMT whose clock ticks in localtime (for
+Windows bug-for-bug compatibility), the clock is set in the future, and
+this will cause e2fsck to complain and force a full file system check.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3230,7 +3230,18 @@ static int ext4_commit_super(struct supe
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+- es->s_wtime = cpu_to_le32(get_seconds());
++ /*
++ * If the file system is mounted read-only, don't update the
++ * superblock write time. This avoids updating the superblock
++ * write time when we are mounting the root file system
++ * read/only but we need to replay the journal; at that point,
++ * for people who are east of GMT and who make their clock
++ * tick in localtime for Windows bug-for-bug compatibility,
++ * the clock is set in the future, and this will cause e2fsck
++ * to complain and force a full file system check.
++ */
++ if (!(sb->s_flags & MS_RDONLY))
++ es->s_wtime = cpu_to_le32(get_seconds());
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+
+
+From linux@linux.site Thu Dec 10 20:27:47 2009
+Message-Id: <20091211042746.862451041@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:06 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Andreas Schlick <schlick@lavabit.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [28/90] ext4: Always set dx_nodes fake_dirent explicitly.
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0028-ext4-Always-set-dx_node-s-fake_dirent-explicitly.patch
+Content-Length: 1031
+Lines: 28
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 1f7bebb9e911d870fa8f997ddff838e82b5715ea)
+
+When ext4_dx_add_entry() has to split an index node, it has to ensure that
+name_len of dx_node's fake_dirent is also zero, because otherwise e2fsck
+won't recognise it as an intermediate htree node and consider the htree to
+be corrupted.
+
+Signed-off-by: Andreas Schlick <schlick@lavabit.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/namei.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1590,9 +1590,9 @@ static int ext4_dx_add_entry(handle_t *h
+ goto cleanup;
+ node2 = (struct dx_node *)(bh2->b_data);
+ entries2 = node2->entries;
++ memset(&node2->fake, 0, sizeof(struct fake_dirent));
+ node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+ sb->s_blocksize);
+- node2->fake.inode = 0;
+ BUFFER_TRACE(frame->bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, frame->bh);
+ if (err)
+
+
+From linux@linux.site Thu Dec 10 20:27:48 2009
+Message-Id: <20091211042747.846949412@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:07 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [29/90] ext4: Fix initalization of s_flex_groups
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0029-ext4-Fix-initalization-of-s_flex_groups.patch
+Content-Length: 1510
+Lines: 40
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 7ad9bb651fc2036ea94bed94da76a4b08959a911)
+
+The s_flex_groups array should have been initialized using atomic_add
+to sum up the free counts from the block groups that make up a
+flex_bg. By using atomic_set, the value of the s_flex_groups array
+was set to the values of the last block group in the flex_bg.
+
+The impact of this bug is that the block and inode allocation
+algorithms might not pick the best flex_bg for new allocation.
+
+Thanks to Damien Guibouret for pointing out this problem!
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1696,12 +1696,12 @@ static int ext4_fill_flex_info(struct su
+ gdp = ext4_get_group_desc(sb, i, NULL);
+
+ flex_group = ext4_flex_group(sbi, i);
+- atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+- ext4_free_inodes_count(sb, gdp));
+- atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+- ext4_free_blks_count(sb, gdp));
+- atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+- ext4_used_dirs_count(sb, gdp));
++ atomic_add(ext4_free_inodes_count(sb, gdp),
++ &sbi->s_flex_groups[flex_group].free_inodes);
++ atomic_add(ext4_free_blks_count(sb, gdp),
++ &sbi->s_flex_groups[flex_group].free_blocks);
++ atomic_add(ext4_used_dirs_count(sb, gdp),
++ &sbi->s_flex_groups[flex_group].used_dirs);
+ }
+
+ return 1;
+
+
+From linux@linux.site Thu Dec 10 20:27:49 2009
+Message-Id: <20091211042748.760455091@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:08 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [30/90] ext4: Fix include/trace/events/ext4.h to work with Systemtap
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0030-ext4-Fix-include-trace-events-ext4.h-to-work-with-Sy.patch
+Content-Length: 1379
+Lines: 46
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 3661d28615ea580c1db02a972fd4d3898df1cb01)
+
+Using relative pathnames in #include statements interacts badly with
+SystemTap, since the fs/ext4/*.h header files are not packaged up as
+part of a distribution kernel's header files. Since systemtap doesn't
+use TP_fast_assign(), we can use a blind structure definition and then
+make sure the needed header files are defined before the ext4 source
+files #include the trace/events/ext4.h header file.
+
+https://bugzilla.redhat.com/show_bug.cgi?id=512478
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 1 +
+ include/trace/events/ext4.h | 6 ++++--
+ 2 files changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -45,6 +45,7 @@
+ #include "ext4_jbd2.h"
+ #include "xattr.h"
+ #include "acl.h"
++#include "mballoc.h"
+
+ #define CREATE_TRACE_POINTS
+ #include <trace/events/ext4.h>
+--- a/include/trace/events/ext4.h
++++ b/include/trace/events/ext4.h
+@@ -5,10 +5,12 @@
+ #define _TRACE_EXT4_H
+
+ #include <linux/writeback.h>
+-#include "../../../fs/ext4/ext4.h"
+-#include "../../../fs/ext4/mballoc.h"
+ #include <linux/tracepoint.h>
+
++struct ext4_allocation_context;
++struct ext4_allocation_request;
++struct ext4_prealloc_space;
++
+ TRACE_EVENT(ext4_free_inode,
+ TP_PROTO(struct inode *inode),
+
+
+
+From linux@linux.site Thu Dec 10 20:27:49 2009
+Message-Id: <20091211042749.363249380@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:09 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.co.jp>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [31/90] ext4: Fix small typo for move_extent_per_page()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0031-ext4-Fix-small-typo-for-move_extent_per_page.patch
+Content-Length: 1155
+Lines: 33
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 44fc48f7048ab9657b524938a832fec4e0acea98)
+
+This function moves extents one page at a time, so rename it from
+move_extent_par_page() to move_extent_per_page().
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.co.jp>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -740,7 +740,7 @@ out:
+ * on success, or a negative error value on failure.
+ */
+ static int
+-move_extent_par_page(struct file *o_filp, struct inode *donor_inode,
++move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ pgoff_t orig_page_offset, int data_offset_in_page,
+ int block_len_in_page, int uninit)
+ {
+@@ -1267,7 +1267,7 @@ ext4_move_extents(struct file *o_filp, s
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+- ret = move_extent_par_page(o_filp, donor_inode,
++ ret = move_extent_per_page(o_filp, donor_inode,
+ orig_page_offset,
+ data_offset_in_page,
+ block_len_in_page, uninit);
+
+
+From linux@linux.site Thu Dec 10 20:27:50 2009
+Message-Id: <20091211042749.975038594@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:10 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [32/90] ext4: Replace get_ext_path macro with an inline funciton
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0032-ext4-Replace-get_ext_path-macro-with-an-inline-funci.patch
+Content-Length: 4445
+Lines: 142
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit e8505970af46658ece2545e9bc1fe594998fdcdf)
+
+Replace get_ext_path macro with an inline function,
+since this macro looks like a function call but its arguments
+get modified. Ted pointed this out, thanks.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 55 ++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 34 insertions(+), 21 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -19,14 +19,29 @@
+ #include "ext4_extents.h"
+ #include "ext4.h"
+
+-#define get_ext_path(path, inode, block, ret) \
+- do { \
+- path = ext4_ext_find_extent(inode, block, path); \
+- if (IS_ERR(path)) { \
+- ret = PTR_ERR(path); \
+- path = NULL; \
+- } \
+- } while (0)
++/**
++ * get_ext_path - Find an extent path for designated logical block number.
++ *
++ * @inode: an inode which is searched
++ * @lblock: logical block number to find an extent path
++ * @path: pointer to an extent path pointer (for output)
++ *
++ * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
++ * on failure.
++ */
++static inline int
++get_ext_path(struct inode *inode, ext4_lblk_t lblock,
++ struct ext4_ext_path **path)
++{
++ int ret = 0;
++
++ *path = ext4_ext_find_extent(inode, lblock, *path);
++ if (IS_ERR(*path)) {
++ ret = PTR_ERR(*path);
++ *path = NULL;
++ }
++ return ret;
++}
+
+ /**
+ * copy_extent_status - Copy the extent's initialization status
+@@ -283,7 +298,7 @@ mext_insert_across_blocks(handle_t *hand
+ }
+
+ if (new_flag) {
+- get_ext_path(orig_path, orig_inode, eblock, err);
++ err = get_ext_path(orig_inode, eblock, &orig_path);
+ if (orig_path == NULL)
+ goto out;
+
+@@ -293,8 +308,8 @@ mext_insert_across_blocks(handle_t *hand
+ }
+
+ if (end_flag) {
+- get_ext_path(orig_path, orig_inode,
+- le32_to_cpu(end_ext->ee_block) - 1, err);
++ err = get_ext_path(orig_inode,
++ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+ if (orig_path == NULL)
+ goto out;
+
+@@ -631,12 +646,12 @@ mext_replace_branches(handle_t *handle,
+ mext_double_down_write(orig_inode, donor_inode);
+
+ /* Get the original extent for the block "orig_off" */
+- get_ext_path(orig_path, orig_inode, orig_off, err);
++ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (orig_path == NULL)
+ goto out;
+
+ /* Get the donor extent for the head */
+- get_ext_path(donor_path, donor_inode, donor_off, err);
++ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (donor_path == NULL)
+ goto out;
+ depth = ext_depth(orig_inode);
+@@ -678,7 +693,7 @@ mext_replace_branches(handle_t *handle,
+
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+- get_ext_path(orig_path, orig_inode, orig_off, err);
++ err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (orig_path == NULL)
+ goto out;
+ depth = ext_depth(orig_inode);
+@@ -692,8 +707,7 @@ mext_replace_branches(handle_t *handle,
+
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+- get_ext_path(donor_path, donor_inode,
+- donor_off, err);
++ err = get_ext_path(donor_inode, donor_off, &donor_path);
+ if (donor_path == NULL)
+ goto out;
+ depth = ext_depth(donor_inode);
+@@ -1154,12 +1168,12 @@ ext4_move_extents(struct file *o_filp, s
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+- get_ext_path(orig_path, orig_inode, block_start, ret);
++ ret = get_ext_path(orig_inode, block_start, &orig_path);
+ if (orig_path == NULL)
+ goto out2;
+
+ /* Get path structure to check the hole */
+- get_ext_path(holecheck_path, orig_inode, block_start, ret);
++ ret = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (holecheck_path == NULL)
+ goto out;
+
+@@ -1289,8 +1303,7 @@ ext4_move_extents(struct file *o_filp, s
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+- get_ext_path(holecheck_path, orig_inode,
+- seq_start, ret);
++ ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (holecheck_path == NULL)
+ break;
+ depth = holecheck_path->p_depth;
+@@ -1298,7 +1311,7 @@ ext4_move_extents(struct file *o_filp, s
+ /* Decrease buffer counter */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+- get_ext_path(orig_path, orig_inode, seq_start, ret);
++ ret = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (orig_path == NULL)
+ break;
+
+
+
+From linux@linux.site Thu Dec 10 20:27:51 2009
+Message-Id: <20091211042750.648840028@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:11 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [33/90] ext4: Replace BUG_ON() with ext4_error() in move_extents.c
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0033-ext4-Replace-BUG_ON-with-ext4_error-in-move_extents..patch
+Content-Length: 10385
+Lines: 352
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 2147b1a6a48e28399120ca51d4a91840a278611f)
+
+Replace BUG_ON calls with calls to ext4_error()
+to print an error message if EXT4_IOC_MOVE_EXT fails
+for some reason. This will help with debugging.
+Ted pointed this out, thanks.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 149 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 109 insertions(+), 40 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -128,6 +128,31 @@ mext_next_extent(struct inode *inode, st
+ }
+
+ /**
++ * mext_check_null_inode - NULL check for two inodes
++ *
++ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
++ */
++static int
++mext_check_null_inode(struct inode *inode1, struct inode *inode2,
++ const char *function)
++{
++ int ret = 0;
++
++ if (inode1 == NULL) {
++ ext4_error(inode2->i_sb, function,
++ "Both inodes should not be NULL: "
++ "inode1 NULL inode2 %lu", inode2->i_ino);
++ ret = -EIO;
++ } else if (inode2 == NULL) {
++ ext4_error(inode1->i_sb, function,
++ "Both inodes should not be NULL: "
++ "inode1 %lu inode2 NULL", inode1->i_ino);
++ ret = -EIO;
++ }
++ return ret;
++}
++
++/**
+ * mext_double_down_read - Acquire two inodes' read semaphore
+ *
+ * @orig_inode: original inode structure
+@@ -139,8 +164,6 @@ mext_double_down_read(struct inode *orig
+ {
+ struct inode *first = orig_inode, *second = donor_inode;
+
+- BUG_ON(orig_inode == NULL || donor_inode == NULL);
+-
+ /*
+ * Use the inode number to provide the stable locking order instead
+ * of its address, because the C language doesn't guarantee you can
+@@ -167,8 +190,6 @@ mext_double_down_write(struct inode *ori
+ {
+ struct inode *first = orig_inode, *second = donor_inode;
+
+- BUG_ON(orig_inode == NULL || donor_inode == NULL);
+-
+ /*
+ * Use the inode number to provide the stable locking order instead
+ * of its address, because the C language doesn't guarantee you can
+@@ -193,8 +214,6 @@ mext_double_down_write(struct inode *ori
+ static void
+ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+ {
+- BUG_ON(orig_inode == NULL || donor_inode == NULL);
+-
+ up_read(&EXT4_I(orig_inode)->i_data_sem);
+ up_read(&EXT4_I(donor_inode)->i_data_sem);
+ }
+@@ -209,8 +228,6 @@ mext_double_up_read(struct inode *orig_i
+ static void
+ mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
+ {
+- BUG_ON(orig_inode == NULL || donor_inode == NULL);
+-
+ up_write(&EXT4_I(orig_inode)->i_data_sem);
+ up_write(&EXT4_I(donor_inode)->i_data_sem);
+ }
+@@ -534,7 +551,15 @@ mext_leaf_block(handle_t *handle, struct
+ * oext |-----------|
+ * new_ext |-------|
+ */
+- BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end);
++ if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
++ ext4_error(orig_inode->i_sb, __func__,
++ "new_ext_end(%u) should be less than or equal to "
++ "oext->ee_block(%u) + oext_alen(%d) - 1",
++ new_ext_end, le32_to_cpu(oext->ee_block),
++ oext_alen);
++ ret = -EIO;
++ goto out;
++ }
+
+ /*
+ * Case: new_ext is smaller than original extent
+@@ -558,6 +583,7 @@ mext_leaf_block(handle_t *handle, struct
+
+ ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
+ o_end, &start_ext, &new_ext, &end_ext);
++out:
+ return ret;
+ }
+
+@@ -668,7 +694,20 @@ mext_replace_branches(handle_t *handle,
+ /* Loop for the donor extents */
+ while (1) {
+ /* The extent for donor must be found. */
+- BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block));
++ if (!dext) {
++ ext4_error(donor_inode->i_sb, __func__,
++ "The extent for donor must be found");
++ err = -EIO;
++ goto out;
++ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
++ ext4_error(donor_inode->i_sb, __func__,
++ "Donor offset(%u) and the first block of donor "
++ "extent(%u) should be equal",
++ donor_off,
++ le32_to_cpu(tmp_dext.ee_block));
++ err = -EIO;
++ goto out;
++ }
+
+ /* Set donor extent to orig extent */
+ err = mext_leaf_block(handle, orig_inode,
+@@ -1050,18 +1089,23 @@ mext_check_arguments(struct inode *orig_
+ * @inode1: the inode structure
+ * @inode2: the inode structure
+ *
+- * Lock two inodes' i_mutex by i_ino order. This function is moved from
+- * fs/inode.c.
++ * Lock two inodes' i_mutex by i_ino order.
++ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+-static void
++static int
+ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
+ {
+- if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
+- if (inode1)
+- mutex_lock(&inode1->i_mutex);
+- else if (inode2)
+- mutex_lock(&inode2->i_mutex);
+- return;
++ int ret = 0;
++
++ BUG_ON(inode1 == NULL && inode2 == NULL);
++
++ ret = mext_check_null_inode(inode1, inode2, __func__);
++ if (ret < 0)
++ goto out;
++
++ if (inode1 == inode2) {
++ mutex_lock(&inode1->i_mutex);
++ goto out;
+ }
+
+ if (inode1->i_ino < inode2->i_ino) {
+@@ -1071,6 +1115,9 @@ mext_inode_double_lock(struct inode *ino
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+ }
++
++out:
++ return ret;
+ }
+
+ /**
+@@ -1079,17 +1126,28 @@ mext_inode_double_lock(struct inode *ino
+ * @inode1: the inode that is released first
+ * @inode2: the inode that is released second
+ *
+- * This function is moved from fs/inode.c.
++ * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
+ */
+
+-static void
++static int
+ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
+ {
++ int ret = 0;
++
++ BUG_ON(inode1 == NULL && inode2 == NULL);
++
++ ret = mext_check_null_inode(inode1, inode2, __func__);
++ if (ret < 0)
++ goto out;
++
+ if (inode1)
+ mutex_unlock(&inode1->i_mutex);
+
+ if (inode2 && inode2 != inode1)
+ mutex_unlock(&inode2->i_mutex);
++
++out:
++ return ret;
+ }
+
+ /**
+@@ -1146,21 +1204,23 @@ ext4_move_extents(struct file *o_filp, s
+ ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
+ ext4_lblk_t rest_blocks;
+ pgoff_t orig_page_offset = 0, seq_end_page;
+- int ret, depth, last_extent = 0;
++ int ret1, ret2, depth, last_extent = 0;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+ int data_offset_in_page;
+ int block_len_in_page;
+ int uninit;
+
+ /* protect orig and donor against a truncate */
+- mext_inode_double_lock(orig_inode, donor_inode);
++ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
++ if (ret1 < 0)
++ return ret1;
+
+ mext_double_down_read(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+- ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
++ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len, *moved_len);
+ mext_double_up_read(orig_inode, donor_inode);
+- if (ret)
++ if (ret1)
+ goto out2;
+
+ file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+@@ -1168,19 +1228,19 @@ ext4_move_extents(struct file *o_filp, s
+ if (file_end < block_end)
+ len -= block_end - file_end;
+
+- ret = get_ext_path(orig_inode, block_start, &orig_path);
++ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+ if (orig_path == NULL)
+ goto out2;
+
+ /* Get path structure to check the hole */
+- ret = get_ext_path(orig_inode, block_start, &holecheck_path);
++ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+ if (holecheck_path == NULL)
+ goto out;
+
+ depth = ext_depth(orig_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+ if (ext_cur == NULL) {
+- ret = -EINVAL;
++ ret1 = -EINVAL;
+ goto out;
+ }
+
+@@ -1193,13 +1253,13 @@ ext4_move_extents(struct file *o_filp, s
+ last_extent = mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+- ret = last_extent;
++ ret1 = last_extent;
+ goto out;
+ }
+ last_extent = mext_next_extent(orig_inode, orig_path,
+ &ext_dummy);
+ if (last_extent < 0) {
+- ret = last_extent;
++ ret1 = last_extent;
+ goto out;
+ }
+ }
+@@ -1209,7 +1269,7 @@ ext4_move_extents(struct file *o_filp, s
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+ ext4_debug("ext4 move extent: The specified range of file "
+ "may be the hole\n");
+- ret = -EINVAL;
++ ret1 = -EINVAL;
+ goto out;
+ }
+
+@@ -1229,7 +1289,7 @@ ext4_move_extents(struct file *o_filp, s
+ last_extent = mext_next_extent(orig_inode, holecheck_path,
+ &ext_cur);
+ if (last_extent < 0) {
+- ret = last_extent;
++ ret1 = last_extent;
+ break;
+ }
+ add_blocks = ext4_ext_get_actual_len(ext_cur);
+@@ -1281,16 +1341,23 @@ ext4_move_extents(struct file *o_filp, s
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+- ret = move_extent_per_page(o_filp, donor_inode,
++ ret1 = move_extent_per_page(o_filp, donor_inode,
+ orig_page_offset,
+ data_offset_in_page,
+ block_len_in_page, uninit);
+- if (ret < 0)
++ if (ret1 < 0)
+ goto out;
+ orig_page_offset++;
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
+- BUG_ON(*moved_len > len);
++ if (*moved_len > len) {
++ ext4_error(orig_inode->i_sb, __func__,
++ "We replaced blocks too much! "
++ "sum of replaced: %llu requested: %llu",
++ *moved_len, len);
++ ret1 = -EIO;
++ goto out;
++ }
+
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+@@ -1303,7 +1370,7 @@ ext4_move_extents(struct file *o_filp, s
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+- ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
++ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+ if (holecheck_path == NULL)
+ break;
+ depth = holecheck_path->p_depth;
+@@ -1311,7 +1378,7 @@ ext4_move_extents(struct file *o_filp, s
+ /* Decrease buffer counter */
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+- ret = get_ext_path(orig_inode, seq_start, &orig_path);
++ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+ if (orig_path == NULL)
+ break;
+
+@@ -1330,10 +1397,12 @@ out:
+ kfree(holecheck_path);
+ }
+ out2:
+- mext_inode_double_unlock(orig_inode, donor_inode);
++ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
+
+- if (ret)
+- return ret;
++ if (ret1)
++ return ret1;
++ else if (ret2)
++ return ret2;
+
+ return 0;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:51 2009
+Message-Id: <20091211042751.274203312@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:12 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [34/90] ext4: Add null extent check to ext_get_path
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0034-ext4-Add-null-extent-check-to-ext_get_path.patch
+Content-Length: 4115
+Lines: 142
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 347fa6f1c7cb5df2b38d3c9167cfe242ce0cd1da)
+
+There is the possibility that path structure which is taken
+by ext4_ext_find_extent() indicates null extents.
+Because during data block exchanging in ext4_move_extents(),
+constitution of an extent tree may be changed.
+As a solution, the patch adds null extent check
+to get_ext_path().
+
+Reported-by: Peng Tao <bergwolf@gmail.com>
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 34 ++++++++++++++++------------------
+ 1 file changed, 16 insertions(+), 18 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -39,7 +39,9 @@ get_ext_path(struct inode *inode, ext4_l
+ if (IS_ERR(*path)) {
+ ret = PTR_ERR(*path);
+ *path = NULL;
+- }
++ } else if ((*path)[ext_depth(inode)].p_ext == NULL)
++ ret = -ENODATA;
++
+ return ret;
+ }
+
+@@ -316,7 +318,7 @@ mext_insert_across_blocks(handle_t *hand
+
+ if (new_flag) {
+ err = get_ext_path(orig_inode, eblock, &orig_path);
+- if (orig_path == NULL)
++ if (err)
+ goto out;
+
+ if (ext4_ext_insert_extent(handle, orig_inode,
+@@ -327,7 +329,7 @@ mext_insert_across_blocks(handle_t *hand
+ if (end_flag) {
+ err = get_ext_path(orig_inode,
+ le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
+- if (orig_path == NULL)
++ if (err)
+ goto out;
+
+ if (ext4_ext_insert_extent(handle, orig_inode,
+@@ -673,12 +675,12 @@ mext_replace_branches(handle_t *handle,
+
+ /* Get the original extent for the block "orig_off" */
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+- if (orig_path == NULL)
++ if (err)
+ goto out;
+
+ /* Get the donor extent for the head */
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+- if (donor_path == NULL)
++ if (err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+@@ -733,7 +735,7 @@ mext_replace_branches(handle_t *handle,
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ err = get_ext_path(orig_inode, orig_off, &orig_path);
+- if (orig_path == NULL)
++ if (err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+@@ -747,7 +749,7 @@ mext_replace_branches(handle_t *handle,
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+ err = get_ext_path(donor_inode, donor_off, &donor_path);
+- if (donor_path == NULL)
++ if (err)
+ goto out;
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+@@ -1221,7 +1223,7 @@ ext4_move_extents(struct file *o_filp, s
+ donor_start, &len, *moved_len);
+ mext_double_up_read(orig_inode, donor_inode);
+ if (ret1)
+- goto out2;
++ goto out;
+
+ file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
+ block_end = block_start + len - 1;
+@@ -1229,20 +1231,16 @@ ext4_move_extents(struct file *o_filp, s
+ len -= block_end - file_end;
+
+ ret1 = get_ext_path(orig_inode, block_start, &orig_path);
+- if (orig_path == NULL)
+- goto out2;
++ if (ret1)
++ goto out;
+
+ /* Get path structure to check the hole */
+ ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
+- if (holecheck_path == NULL)
++ if (ret1)
+ goto out;
+
+ depth = ext_depth(orig_inode);
+ ext_cur = holecheck_path[depth].p_ext;
+- if (ext_cur == NULL) {
+- ret1 = -EINVAL;
+- goto out;
+- }
+
+ /*
+ * Get proper extent whose ee_block is beyond block_start
+@@ -1371,7 +1369,7 @@ ext4_move_extents(struct file *o_filp, s
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+ ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
+- if (holecheck_path == NULL)
++ if (ret1)
+ break;
+ depth = holecheck_path->p_depth;
+
+@@ -1379,7 +1377,7 @@ ext4_move_extents(struct file *o_filp, s
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+ ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
+- if (orig_path == NULL)
++ if (ret1)
+ break;
+
+ ext_cur = holecheck_path[depth].p_ext;
+@@ -1396,7 +1394,7 @@ out:
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+-out2:
++
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
+
+ if (ret1)
+
+
+From linux@linux.site Thu Dec 10 20:27:52 2009
+Message-Id: <20091211042751.864763427@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:13 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [35/90] ext4: Fix different block exchange issue in EXT4_IOC_MOVE_EXT
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0035-ext4-Fix-different-block-exchange-issue-in-EXT4_IOC_.patch
+Content-Length: 3955
+Lines: 122
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit c40ce3c9ea97425a12d7e44031a98fe50add6fc1)
+
+If logical block offset of original file which is passed to
+EXT4_IOC_MOVE_EXT is different from donor file's,
+a calculation error occurs in mext_calc_swap_extents(),
+therefore wrong block is exchanged between original file and donor file.
+As a result, we hit ext4_error() in check_block_validity().
+To detect the logical offset difference in EXT4_IOC_MOVE_EXT,
+add checks to mext_calc_swap_extents() and handle it as error,
+since data exchange must be done between the same blocks in EXT4_IOC_MOVE_EXT.
+
+Reported-by: Peng Tao <bergwolf@gmail.com>
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 46 +++++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 37 insertions(+), 9 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -597,8 +597,10 @@ out:
+ * @orig_off: block offset of original inode
+ * @donor_off: block offset of donor inode
+ * @max_count: the maximun length of extents
++ *
++ * Return 0 on success, or a negative error value on failure.
+ */
+-static void
++static int
+ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
+ struct ext4_extent *tmp_oext,
+ ext4_lblk_t orig_off, ext4_lblk_t donor_off,
+@@ -607,6 +609,19 @@ mext_calc_swap_extents(struct ext4_exten
+ ext4_lblk_t diff, orig_diff;
+ struct ext4_extent dext_old, oext_old;
+
++ BUG_ON(orig_off != donor_off);
++
++ /* original and donor extents have to cover the same block offset */
++ if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
++ le32_to_cpu(tmp_oext->ee_block) +
++ ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
++ return -ENODATA;
++
++ if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
++ le32_to_cpu(tmp_dext->ee_block) +
++ ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
++ return -ENODATA;
++
+ dext_old = *tmp_dext;
+ oext_old = *tmp_oext;
+
+@@ -634,6 +649,8 @@ mext_calc_swap_extents(struct ext4_exten
+
+ copy_extent_status(&oext_old, tmp_dext);
+ copy_extent_status(&dext_old, tmp_oext);
++
++ return 0;
+ }
+
+ /**
+@@ -690,8 +707,10 @@ mext_replace_branches(handle_t *handle,
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
++ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count);
++ if (err)
++ goto out;
+
+ /* Loop for the donor extents */
+ while (1) {
+@@ -760,9 +779,10 @@ mext_replace_branches(handle_t *handle,
+ }
+ tmp_dext = *dext;
+
+- mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+- donor_off,
+- count - replaced_count);
++ err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
++ donor_off, count - replaced_count);
++ if (err)
++ goto out;
+ }
+
+ out:
+@@ -1243,11 +1263,15 @@ ext4_move_extents(struct file *o_filp, s
+ ext_cur = holecheck_path[depth].p_ext;
+
+ /*
+- * Get proper extent whose ee_block is beyond block_start
+- * if block_start was within the hole.
++ * Get proper starting location of block replacement if block_start was
++ * within the hole.
+ */
+ if (le32_to_cpu(ext_cur->ee_block) +
+ ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
++ /*
++ * The hole exists between extents or the tail of
++ * original file.
++ */
+ last_extent = mext_next_extent(orig_inode,
+ holecheck_path, &ext_cur);
+ if (last_extent < 0) {
+@@ -1260,8 +1284,12 @@ ext4_move_extents(struct file *o_filp, s
+ ret1 = last_extent;
+ goto out;
+ }
+- }
+- seq_start = block_start;
++ seq_start = le32_to_cpu(ext_cur->ee_block);
++ } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
++ /* The hole exists at the beginning of original file. */
++ seq_start = le32_to_cpu(ext_cur->ee_block);
++ else
++ seq_start = block_start;
+
+ /* No blocks within the specified range. */
+ if (le32_to_cpu(ext_cur->ee_block) > block_end) {
+
+
+From linux@linux.site Thu Dec 10 20:27:52 2009
+Message-Id: <20091211042752.421711582@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:14 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [36/90] ext4: limit block allocations for indirect-block files to < 2^32
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0036-ext4-limit-block-allocations-for-indirect-block-file.patch
+Content-Length: 5868
+Lines: 174
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit fb0a387dcdcd21aab1b09ee7fd80b7c979bdbbfd)
+
+Today, the ext4 allocator will happily allocate blocks past
+2^32 for indirect-block files, which results in the block
+numbers getting truncated, and corruption ensues.
+
+This patch limits such allocations to < 2^32, and adds
+BUG_ONs if we do get blocks larger than that.
+
+This should address RH Bug 519471, ext4 bitmap allocator
+must limit blocks to < 2^32
+
+* ext4_find_goal() is modified to choose a goal < UINT_MAX,
+ so that our starting point is in an acceptable range.
+
+* ext4_xattr_block_set() is modified such that the goal block
+ is < UINT_MAX, as above.
+
+* ext4_mb_regular_allocator() is modified so that the group
+ search does not continue into groups which are too high
+
+* ext4_mb_use_preallocated() has a check that we don't use
+ preallocated space which is too far out
+
+* ext4_alloc_blocks() and ext4_xattr_block_set() add some BUG_ONs
+
+No attempt has been made to limit inode locations to < 2^32,
+so we may wind up with blocks far from their inodes. Doing
+this much already will lead to some odd ENOSPC issues when the
+"lower 32" gets full, and further restricting inodes could
+make that even weirder.
+
+For high inodes, choosing a goal of the original, % UINT_MAX,
+may be a bit odd, but then we're in an odd situation anyway,
+and I don't know of a better heuristic.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 4 ++++
+ fs/ext4/inode.c | 11 ++++++++++-
+ fs/ext4/mballoc.c | 9 +++++++++
+ fs/ext4/super.c | 2 ++
+ fs/ext4/xattr.c | 15 +++++++++++++--
+ 5 files changed, 38 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -388,6 +388,9 @@ struct ext4_mount_options {
+ #endif
+ };
+
++/* Max physical block we can addres w/o extents */
++#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
++
+ /*
+ * Structure of an inode on the disk
+ */
+@@ -843,6 +846,7 @@ struct ext4_sb_info {
+ unsigned long s_gdb_count; /* Number of group descriptor blocks */
+ unsigned long s_desc_per_block; /* Number of group descriptors per block */
+ ext4_group_t s_groups_count; /* Number of groups in the fs */
++ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
+ unsigned long s_overhead_last; /* Last calculated overhead */
+ unsigned long s_blocks_last; /* Last seen block count */
+ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -564,15 +564,21 @@ static ext4_fsblk_t ext4_find_near(struc
+ *
+ * Normally this function find the preferred place for block allocation,
+ * returns it.
++ * Because this is only used for non-extent files, we limit the block nr
++ * to 32 bits.
+ */
+ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+ Indirect *partial)
+ {
++ ext4_fsblk_t goal;
++
+ /*
+ * XXX need to get goal block from mballoc's data structures
+ */
+
+- return ext4_find_near(inode, partial);
++ goal = ext4_find_near(inode, partial);
++ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
++ return goal;
+ }
+
+ /**
+@@ -653,6 +659,8 @@ static int ext4_alloc_blocks(handle_t *h
+ if (*err)
+ goto failed_out;
+
++ BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
++
+ target -= count;
+ /* allocate blocks for indirect blocks */
+ while (index < indirect_blks && count) {
+@@ -687,6 +695,7 @@ static int ext4_alloc_blocks(handle_t *h
+ ar.flags = EXT4_MB_HINT_DATA;
+
+ current_block = ext4_mb_new_blocks(handle, &ar, err);
++ BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
+
+ if (*err && (target == blks)) {
+ /*
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1960,6 +1960,10 @@ ext4_mb_regular_allocator(struct ext4_al
+ sb = ac->ac_sb;
+ sbi = EXT4_SB(sb);
+ ngroups = ext4_get_groups_count(sb);
++ /* non-extent files are limited to low blocks/groups */
++ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
++ ngroups = sbi->s_blockfile_groups;
++
+ BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+
+ /* first, try the goal */
+@@ -3355,6 +3359,11 @@ ext4_mb_use_preallocated(struct ext4_all
+ ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
+ continue;
+
++ /* non-extent files can't have physical blocks past 2^32 */
++ if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
++ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
++ continue;
++
+ /* found preallocated blocks, use them */
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 0 && pa->pa_free) {
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2618,6 +2618,8 @@ static int ext4_fill_super(struct super_
+ goto failed_mount;
+ }
+ sbi->s_groups_count = blocks_count;
++ sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
++ (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
+ db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
+ EXT4_DESC_PER_BLOCK(sb);
+ sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -810,12 +810,23 @@ inserted:
+ get_bh(new_bh);
+ } else {
+ /* We need to allocate a new block */
+- ext4_fsblk_t goal = ext4_group_first_block_no(sb,
++ ext4_fsblk_t goal, block;
++
++ goal = ext4_group_first_block_no(sb,
+ EXT4_I(inode)->i_block_group);
+- ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode,
++
++ /* non-extent files can't have physical blocks past 2^32 */
++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
++
++ block = ext4_new_meta_blocks(handle, inode,
+ goal, NULL, &error);
+ if (error)
+ goto cleanup;
++
++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
++
+ ea_idebug(inode, "creating block %d", block);
+
+ new_bh = sb_getblk(sb, block);
+
+
+From linux@linux.site Thu Dec 10 20:27:53 2009
+Message-Id: <20091211042753.004509392@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:15 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [37/90] ext4: store EXT4_EXT_MIGRATE in i_state instead of i_flags
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0037-ext4-store-EXT4_EXT_MIGRATE-in-i_state-instead-of-i_.patch
+Content-Length: 4204
+Lines: 103
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 1b9c12f44c1eb614fd3b8822bfe8f1f5d8e53737)
+
+EXT4_EXT_MIGRATE is only intended to be used for an in-memory flag,
+and the hex value assigned to it collides with FS_DIRECTIO_FL (which
+is also stored in i_flags). There's no reason for the
+EXT4_EXT_MIGRATE bit to be stored in i_flags, so we switch it to use
+i_state instead.
+
+Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 2 +-
+ fs/ext4/inode.c | 6 ++----
+ fs/ext4/migrate.c | 20 ++++++++++----------
+ 3 files changed, 13 insertions(+), 15 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -253,7 +253,6 @@ struct flex_groups {
+ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
+ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
+-#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
+ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
+
+ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
+@@ -291,6 +290,7 @@ static inline __u32 ext4_mask_flags(umod
+ #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
+ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
+ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
++#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
+
+ /* Used to pass group descriptor data when online resize is done */
+ struct ext4_new_group_input {
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1256,8 +1256,7 @@ int ext4_get_blocks(handle_t *handle, st
+ * i_data's format changing. Force the migrate
+ * to fail by clearing migrate flags
+ */
+- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
+- ~EXT4_EXT_MIGRATE;
++ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ }
+ }
+
+@@ -4608,8 +4607,7 @@ static int ext4_do_update_inode(handle_t
+ if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ goto out_brelse;
+ raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
+- /* clear the migrate flag in the raw_inode */
+- raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
++ raw_inode->i_flags = cpu_to_le32(ei->i_flags);
+ if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+ cpu_to_le32(EXT4_OS_HURD))
+ raw_inode->i_file_acl_high =
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(hand
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ /*
+- * if EXT4_EXT_MIGRATE is cleared a block allocation
++ * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
+ * happened after we started the migrate. We need to
+ * fail the migrate
+ */
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) {
++ if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
+ retval = -EAGAIN;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto err_out;
+ } else
+- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags &
+- ~EXT4_EXT_MIGRATE;
++ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ /*
+ * We have the extent map build with the tmp inode.
+ * Now copy the i_data across
+@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode
+ * when we add extents we extent the journal
+ */
+ /*
+- * Even though we take i_mutex we can still cause block allocation
+- * via mmap write to holes. If we have allocated new blocks we fail
+- * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
+- * The flag is updated with i_data_sem held to prevent racing with
+- * block allocation.
++ * Even though we take i_mutex we can still cause block
++ * allocation via mmap write to holes. If we have allocated
++ * new blocks we fail migrate. New block allocation will
++ * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
++ * with i_data_sem held to prevent racing with block
++ * allocation.
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+- EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE;
++ EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ handle = ext4_journal_start(inode, 1);
+
+
+From linux@linux.site Thu Dec 10 20:27:54 2009
+Message-Id: <20091211042753.567755833@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:16 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [38/90] ext4: Fix the alloc on close after a truncate hueristic
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0038-ext4-Fix-the-alloc-on-close-after-a-truncate-huerist.patch
+Content-Length: 1235
+Lines: 33
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 5534fb5bb35a62a94e0bd1fa2421f7fb6e894f10)
+
+In an attempt to avoid doing an unneeded flush after opening a
+(previously non-existent) file with O_CREAT|O_TRUNC, the code only
+triggered the heuristic if ei->disksize was non-zero. Turns out that
+the VFS doesn't call ->truncate() if the file doesn't exist, and
+ei->disksize is always zero even if the file previously existed. So
+remove the test, since it isn't necessary and in fact disabled the
+heuristic.
+
+Thanks to Clemens Eisserer that he was seeing problems with files
+written using kwrite and eclipse after sudden crashes caused by a
+buggy Intel video driver.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3983,8 +3983,7 @@ void ext4_truncate(struct inode *inode)
+ if (!ext4_can_truncate(inode))
+ return;
+
+- if (ei->i_disksize && inode->i_size == 0 &&
+- !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
++ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+
+
+From linux@linux.site Thu Dec 10 20:27:54 2009
+Message-Id: <20091211042754.189709948@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:17 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [39/90] ext4: Fix hueristic which avoids group preallocation for closed files
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0039-ext4-Fix-hueristic-which-avoids-group-preallocation-.patch
+Content-Length: 1048
+Lines: 32
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 71780577306fd1e76c7a92e3b308db624d03adb9)
+
+The heuristic was designed to avoid using locality group preallocation
+when writing the last segment of a closed file. Fix it by deferring
+the setting of size to the maximum of size and isize until after we
+check whether size == isize.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -4162,7 +4162,6 @@ static void ext4_mb_group_or_file(struct
+ size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
+ isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
+ >> bsbits;
+- size = max(size, isize);
+
+ if ((size == isize) &&
+ !ext4_fs_is_busy(sbi) &&
+@@ -4172,6 +4171,7 @@ static void ext4_mb_group_or_file(struct
+ }
+
+ /* don't use group allocation for large files */
++ size = max(size, isize);
+ if (size >= sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+
+
+From linux@linux.site Thu Dec 10 20:27:55 2009
+Message-Id: <20091211042754.655932661@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:18 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [40/90] ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0040-ext4-Adjust-ext4_da_writepages-to-write-out-larger-c.patch
+Content-Length: 11442
+Lines: 340
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 55138e0bc29c0751e2152df9ad35deea542f29b3)
+
+Work around problems in the writeback code to force out writebacks in
+larger chunks than just 4mb, which is just too small. This also works
+around limitations in the ext4 block allocator, which can't allocate
+more than 2048 blocks at a time. So we need to defeat the round-robin
+characteristics of the writeback code and try to write out as many
+blocks in one inode before allowing the writeback code to move on to
+another inode. We add a new per-filesystem tunable,
+max_writeback_mb_bump, which caps this to a default of 128mb per
+inode.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 17 ++++++
+ fs/ext4/inode.c | 121 +++++++++++++++++++++++++++++++++-----------
+ fs/ext4/super.c | 3 +
+ include/trace/events/ext4.h | 54 +++++++++++++++++--
+ 4 files changed, 161 insertions(+), 34 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -114,6 +114,22 @@ struct ext4_allocation_request {
+ };
+
+ /*
++ * Delayed allocation stuff
++ */
++
++struct mpage_da_data {
++ struct inode *inode;
++ sector_t b_blocknr; /* start block number of extent */
++ size_t b_size; /* size of extent */
++ unsigned long b_state; /* state of the extent */
++ unsigned long first_page, next_page; /* extent of pages */
++ struct writeback_control *wbc;
++ int io_done;
++ int pages_written;
++ int retval;
++};
++
++/*
+ * Special inodes numbers
+ */
+ #define EXT4_BAD_INO 1 /* Bad blocks inode */
+@@ -929,6 +945,7 @@ struct ext4_sb_info {
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
++ unsigned int s_max_writeback_mb_bump;
+ /* where last allocation was done - for stream allocation */
+ unsigned long s_mb_last_group;
+ unsigned long s_mb_last_start;
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1146,6 +1146,64 @@ static int check_block_validity(struct i
+ }
+
+ /*
++ * Return the number of dirty pages in the given inode starting at
++ * page frame idx.
++ */
++static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
++ unsigned int max_pages)
++{
++ struct address_space *mapping = inode->i_mapping;
++ pgoff_t index;
++ struct pagevec pvec;
++ pgoff_t num = 0;
++ int i, nr_pages, done = 0;
++
++ if (max_pages == 0)
++ return 0;
++ pagevec_init(&pvec, 0);
++ while (!done) {
++ index = idx;
++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
++ PAGECACHE_TAG_DIRTY,
++ (pgoff_t)PAGEVEC_SIZE);
++ if (nr_pages == 0)
++ break;
++ for (i = 0; i < nr_pages; i++) {
++ struct page *page = pvec.pages[i];
++ struct buffer_head *bh, *head;
++
++ lock_page(page);
++ if (unlikely(page->mapping != mapping) ||
++ !PageDirty(page) ||
++ PageWriteback(page) ||
++ page->index != idx) {
++ done = 1;
++ unlock_page(page);
++ break;
++ }
++ head = page_buffers(page);
++ bh = head;
++ do {
++ if (!buffer_delay(bh) &&
++ !buffer_unwritten(bh)) {
++ done = 1;
++ break;
++ }
++ } while ((bh = bh->b_this_page) != head);
++ unlock_page(page);
++ if (done)
++ break;
++ idx++;
++ num++;
++ if (num >= max_pages)
++ break;
++ }
++ pagevec_release(&pvec);
++ }
++ return num;
++}
++
++/*
+ * The ext4_get_blocks() function tries to look up the requested blocks,
+ * and returns if the blocks are already mapped.
+ *
+@@ -1881,22 +1939,6 @@ static void ext4_da_page_release_reserva
+ }
+
+ /*
+- * Delayed allocation stuff
+- */
+-
+-struct mpage_da_data {
+- struct inode *inode;
+- sector_t b_blocknr; /* start block number of extent */
+- size_t b_size; /* size of extent */
+- unsigned long b_state; /* state of the extent */
+- unsigned long first_page, next_page; /* extent of pages */
+- struct writeback_control *wbc;
+- int io_done;
+- int pages_written;
+- int retval;
+-};
+-
+-/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with writepage() call back
+ *
+@@ -2756,8 +2798,10 @@ static int ext4_da_writepages(struct add
+ int no_nrwrite_index_update;
+ int pages_written = 0;
+ long pages_skipped;
++ unsigned int max_pages;
+ int range_cyclic, cycled = 1, io_done = 0;
+- int needed_blocks, ret = 0, nr_to_writebump = 0;
++ int needed_blocks, ret = 0;
++ long desired_nr_to_write, nr_to_writebump = 0;
+ loff_t range_start = wbc->range_start;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+@@ -2784,16 +2828,6 @@ static int ext4_da_writepages(struct add
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
+ return -EROFS;
+
+- /*
+- * Make sure nr_to_write is >= sbi->s_mb_stream_request
+- * This make sure small files blocks are allocated in
+- * single attempt. This ensure that small files
+- * get less fragmented.
+- */
+- if (wbc->nr_to_write < sbi->s_mb_stream_request) {
+- nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+- wbc->nr_to_write = sbi->s_mb_stream_request;
+- }
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+
+@@ -2808,6 +2842,36 @@ static int ext4_da_writepages(struct add
+ } else
+ index = wbc->range_start >> PAGE_CACHE_SHIFT;
+
++ /*
++ * This works around two forms of stupidity. The first is in
++ * the writeback code, which caps the maximum number of pages
++ * written to be 1024 pages. This is wrong on multiple
++ * levels; different architectues have a different page size,
++ * which changes the maximum amount of data which gets
++ * written. Secondly, 4 megabytes is way too small. XFS
++ * forces this value to be 16 megabytes by multiplying
++ * nr_to_write parameter by four, and then relies on its
++ * allocator to allocate larger extents to make them
++ * contiguous. Unfortunately this brings us to the second
++ * stupidity, which is that ext4's mballoc code only allocates
++ * at most 2048 blocks. So we force contiguous writes up to
++ * the number of dirty blocks in the inode, or
++ * sbi->max_writeback_mb_bump whichever is smaller.
++ */
++ max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
++ if (!range_cyclic && range_whole)
++ desired_nr_to_write = wbc->nr_to_write * 8;
++ else
++ desired_nr_to_write = ext4_num_dirty_pages(inode, index,
++ max_pages);
++ if (desired_nr_to_write > max_pages)
++ desired_nr_to_write = max_pages;
++
++ if (wbc->nr_to_write < desired_nr_to_write) {
++ nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
++ wbc->nr_to_write = desired_nr_to_write;
++ }
++
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+
+@@ -2926,7 +2990,8 @@ retry:
+ out_writepages:
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+- wbc->nr_to_write -= nr_to_writebump;
++ if (wbc->nr_to_write > nr_to_writebump)
++ wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
+ trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ return ret;
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2199,6 +2199,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb
+ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+ EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
++EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+
+ static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+@@ -2212,6 +2213,7 @@ static struct attribute *ext4_attrs[] =
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
++ ATTR_LIST(max_writeback_mb_bump),
+ NULL,
+ };
+
+@@ -2681,6 +2683,7 @@ static int ext4_fill_super(struct super_
+ }
+
+ sbi->s_stripe = ext4_get_stripe_size(sbi);
++ sbi->s_max_writeback_mb_bump = 128;
+
+ /*
+ * set up enough so that it can read an inode
+--- a/include/trace/events/ext4.h
++++ b/include/trace/events/ext4.h
+@@ -231,6 +231,7 @@ TRACE_EVENT(ext4_da_writepages,
+ __field( char, for_reclaim )
+ __field( char, for_writepages )
+ __field( char, range_cyclic )
++ __field( pgoff_t, writeback_index )
+ ),
+
+ TP_fast_assign(
+@@ -245,14 +246,51 @@ TRACE_EVENT(ext4_da_writepages,
+ __entry->for_reclaim = wbc->for_reclaim;
+ __entry->for_writepages = wbc->for_writepages;
+ __entry->range_cyclic = wbc->range_cyclic;
++ __entry->writeback_index = inode->i_mapping->writeback_index;
+ ),
+
+- TP_printk("dev %s ino %lu nr_t_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d",
+- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->nr_to_write,
++ TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d for_writepages %d range_cyclic %d writeback_index %lu",
++ jbd2_dev_to_name(__entry->dev),
++ (unsigned long) __entry->ino, __entry->nr_to_write,
+ __entry->pages_skipped, __entry->range_start,
+ __entry->range_end, __entry->nonblocking,
+ __entry->for_kupdate, __entry->for_reclaim,
+- __entry->for_writepages, __entry->range_cyclic)
++ __entry->for_writepages, __entry->range_cyclic,
++ (unsigned long) __entry->writeback_index)
++);
++
++TRACE_EVENT(ext4_da_write_pages,
++ TP_PROTO(struct inode *inode, struct mpage_da_data *mpd),
++
++ TP_ARGS(inode, mpd),
++
++ TP_STRUCT__entry(
++ __field( dev_t, dev )
++ __field( ino_t, ino )
++ __field( __u64, b_blocknr )
++ __field( __u32, b_size )
++ __field( __u32, b_state )
++ __field( unsigned long, first_page )
++ __field( int, io_done )
++ __field( int, pages_written )
++ ),
++
++ TP_fast_assign(
++ __entry->dev = inode->i_sb->s_dev;
++ __entry->ino = inode->i_ino;
++ __entry->b_blocknr = mpd->b_blocknr;
++ __entry->b_size = mpd->b_size;
++ __entry->b_state = mpd->b_state;
++ __entry->first_page = mpd->first_page;
++ __entry->io_done = mpd->io_done;
++ __entry->pages_written = mpd->pages_written;
++ ),
++
++ TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d",
++ jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino,
++ __entry->b_blocknr, __entry->b_size,
++ __entry->b_state, __entry->first_page,
++ __entry->io_done, __entry->pages_written)
+ );
+
+ TRACE_EVENT(ext4_da_writepages_result,
+@@ -270,6 +308,7 @@ TRACE_EVENT(ext4_da_writepages_result,
+ __field( char, encountered_congestion )
+ __field( char, more_io )
+ __field( char, no_nrwrite_index_update )
++ __field( pgoff_t, writeback_index )
+ ),
+
+ TP_fast_assign(
+@@ -281,13 +320,16 @@ TRACE_EVENT(ext4_da_writepages_result,
+ __entry->encountered_congestion = wbc->encountered_congestion;
+ __entry->more_io = wbc->more_io;
+ __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update;
++ __entry->writeback_index = inode->i_mapping->writeback_index;
+ ),
+
+- TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d",
+- jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->ret,
++ TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu",
++ jbd2_dev_to_name(__entry->dev),
++ (unsigned long) __entry->ino, __entry->ret,
+ __entry->pages_written, __entry->pages_skipped,
+ __entry->encountered_congestion, __entry->more_io,
+- __entry->no_nrwrite_index_update)
++ __entry->no_nrwrite_index_update,
++ (unsigned long) __entry->writeback_index)
+ );
+
+ TRACE_EVENT(ext4_da_write_begin,
+
+
+From linux@linux.site Thu Dec 10 20:27:55 2009
+Message-Id: <20091211042755.220900196@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:19 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [41/90] ext4: release reserved quota when block reservation for delalloc retry
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0041-ext4-release-reserved-quota-when-block-reservation-f.patch
+Content-Length: 953
+Lines: 31
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 9f0ccfd8e07d61b413e6536ffa02fbf60d2e20d8)
+
+ext4_da_reserve_space() can reserve quota blocks multiple times if
+ext4_claim_free_blocks() fails and we retry the allocation. We should
+release the quota reservation before restarting.
+
+Bug found by Jan Kara.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1855,11 +1855,11 @@ repeat:
+
+ if (ext4_claim_free_blocks(sbi, total)) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++ vfs_dq_release_reservation_block(inode, total);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ yield();
+ goto repeat;
+ }
+- vfs_dq_release_reservation_block(inode, total);
+ return -ENOSPC;
+ }
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+
+
+From linux@linux.site Thu Dec 10 20:27:56 2009
+Message-Id: <20091211042755.790514342@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:20 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [42/90] ext4: Split uninitialized extents for direct I/O
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0042-ext4-Split-uninitialized-extents-for-direct-I-O.patch
+Content-Length: 21012
+Lines: 650
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 0031462b5b392f90d17f1d75abb795883c44e969)
+
+When writing into an uninitialized extent via direct I/O, and the direct
+I/O doesn't exactly cover the uninitialized extent, split the extent
+into uninitialized and initialized extents before submitting the I/O.
+This avoids needing to deal with an ENOSPC error in the end_io
+callback that gets used for direct I/O.
+
+When the IO is complete, the written extent will be marked as initialized.
+
+Singed-Off-By: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 22 ++
+ fs/ext4/ext4_extents.h | 7
+ fs/ext4/extents.c | 423 ++++++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext4/inode.c | 3
+ fs/ext4/migrate.c | 2
+ fs/ext4/move_extent.c | 4
+ 6 files changed, 419 insertions(+), 42 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -113,6 +113,15 @@ struct ext4_allocation_request {
+ unsigned int flags;
+ };
+
++typedef struct ext4_io_end {
++ struct inode *inode; /* file being written to */
++ unsigned int flag; /* sync IO or AIO */
++ int error; /* I/O error code */
++ ext4_lblk_t offset; /* offset in the file */
++ size_t size; /* size of the extent */
++ struct work_struct work; /* data work queue */
++} ext4_io_end_t;
++
+ /*
+ * Delayed allocation stuff
+ */
+@@ -348,7 +357,16 @@ struct ext4_new_group_data {
+ /* Call ext4_da_update_reserve_space() after successfully
+ allocating the blocks */
+ #define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
+-
++ /* caller is from the direct IO path, request to creation of an
++ unitialized extents if not allocated, split the uninitialized
++ extent if blocks has been preallocated already*/
++#define EXT4_GET_BLOCKS_DIO 0x0010
++#define EXT4_GET_BLOCKS_CONVERT 0x0020
++#define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
++ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
++ /* Convert extent to initialized after direct IO complete */
++#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
++ EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+
+ /*
+ * ioctl commands
+@@ -1702,6 +1720,8 @@ extern void ext4_ext_init(struct super_b
+ extern void ext4_ext_release(struct super_block *);
+ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
+ loff_t len);
++extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
++ loff_t len);
+ extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
+ sector_t block, unsigned int max_blocks,
+ struct buffer_head *bh, int flags);
+--- a/fs/ext4/ext4_extents.h
++++ b/fs/ext4/ext4_extents.h
+@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_le
+ (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
+ }
+
++static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
++{
++ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
++}
++
+ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+ extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
+ extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
+@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct
+ struct ext4_ext_path *path,
+ struct ext4_extent *);
+ extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
+-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
++extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
+ extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
+ ext_prepare_callback, void *);
+ extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -710,7 +710,7 @@ err:
+ * insert new index [@logical;@ptr] into the block at @curp;
+ * check where to insert: before @curp or after @curp
+ */
+-static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
++int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *curp,
+ int logical, ext4_fsblk_t ptr)
+ {
+@@ -1572,7 +1572,7 @@ out:
+ */
+ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path,
+- struct ext4_extent *newext)
++ struct ext4_extent *newext, int flag)
+ {
+ struct ext4_extent_header *eh;
+ struct ext4_extent *ex, *fex;
+@@ -1588,7 +1588,8 @@ int ext4_ext_insert_extent(handle_t *han
+ BUG_ON(path[depth].p_hdr == NULL);
+
+ /* try to insert block into found extent and return */
+- if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
++ if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
++ && ext4_can_extents_be_merged(inode, ex, newext)) {
+ ext_debug("append %d block to %d:%d (from %llu)\n",
+ ext4_ext_get_actual_len(newext),
+ le32_to_cpu(ex->ee_block),
+@@ -1703,7 +1704,8 @@ has_space:
+
+ merge:
+ /* try to merge extents to the right */
+- ext4_ext_try_to_merge(inode, path, nearex);
++ if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
++ ext4_ext_try_to_merge(inode, path, nearex);
+
+ /* try to merge extents to the left */
+
+@@ -2470,7 +2472,6 @@ static int ext4_ext_zeroout(struct inode
+ }
+
+ #define EXT4_EXT_ZERO_LEN 7
+-
+ /*
+ * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * to an uninitialized extent. It may result in splitting the uninitialized
+@@ -2563,7 +2564,8 @@ static int ext4_ext_convert_to_initializ
+ ex3->ee_block = cpu_to_le32(iblock);
+ ext4_ext_store_pblock(ex3, newblock);
+ ex3->ee_len = cpu_to_le16(allocated);
+- err = ext4_ext_insert_extent(handle, inode, path, ex3);
++ err = ext4_ext_insert_extent(handle, inode, path,
++ ex3, 0);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+@@ -2619,7 +2621,7 @@ static int ext4_ext_convert_to_initializ
+ ext4_ext_store_pblock(ex3, newblock + max_blocks);
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+- err = ext4_ext_insert_extent(handle, inode, path, ex3);
++ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+@@ -2737,7 +2739,7 @@ static int ext4_ext_convert_to_initializ
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ goto out;
+ insert:
+- err = ext4_ext_insert_extent(handle, inode, path, &newex);
++ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+ if (err == -ENOSPC) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+@@ -2764,6 +2766,320 @@ fix_extent_len:
+ }
+
+ /*
++ * This function is called by ext4_ext_get_blocks() from
++ * ext4_get_blocks_dio_write() when DIO to write
++ * to an uninitialized extent.
++ *
++ * Writing to an uninitized extent may result in splitting the uninitialized
++ * extent into multiple /intialized unintialized extents (up to three)
++ * There are three possibilities:
++ * a> There is no split required: Entire extent should be uninitialized
++ * b> Splits in two extents: Write is happening at either end of the extent
++ * c> Splits in three extents: Somone is writing in middle of the extent
++ *
++ * One of more index blocks maybe needed if the extent tree grow after
++ * the unintialized extent split. To prevent ENOSPC occur at the IO
++ * complete, we need to split the uninitialized extent before DIO submit
++ * the IO. The uninitilized extent called at this time will be split
++ * into three uninitialized extent(at most). After IO complete, the part
++ * being filled will be convert to initialized by the end_io callback function
++ * via ext4_convert_unwritten_extents().
++ */
++static int ext4_split_unwritten_extents(handle_t *handle,
++ struct inode *inode,
++ struct ext4_ext_path *path,
++ ext4_lblk_t iblock,
++ unsigned int max_blocks,
++ int flags)
++{
++ struct ext4_extent *ex, newex, orig_ex;
++ struct ext4_extent *ex1 = NULL;
++ struct ext4_extent *ex2 = NULL;
++ struct ext4_extent *ex3 = NULL;
++ struct ext4_extent_header *eh;
++ ext4_lblk_t ee_block;
++ unsigned int allocated, ee_len, depth;
++ ext4_fsblk_t newblock;
++ int err = 0;
++ int ret = 0;
++
++ ext_debug("ext4_split_unwritten_extents: inode %lu,"
++ "iblock %llu, max_blocks %u\n", inode->i_ino,
++ (unsigned long long)iblock, max_blocks);
++ depth = ext_depth(inode);
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ ee_block = le32_to_cpu(ex->ee_block);
++ ee_len = ext4_ext_get_actual_len(ex);
++ allocated = ee_len - (iblock - ee_block);
++ newblock = iblock - ee_block + ext_pblock(ex);
++ ex2 = ex;
++ orig_ex.ee_block = ex->ee_block;
++ orig_ex.ee_len = cpu_to_le16(ee_len);
++ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
++
++ /*
++ * if the entire unintialized extent length less than
++ * the size of extent to write, there is no need to split
++ * uninitialized extent
++ */
++ if (allocated <= max_blocks)
++ return ret;
++
++ err = ext4_ext_get_access(handle, inode, path + depth);
++ if (err)
++ goto out;
++ /* ex1: ee_block to iblock - 1 : uninitialized */
++ if (iblock > ee_block) {
++ ex1 = ex;
++ ex1->ee_len = cpu_to_le16(iblock - ee_block);
++ ext4_ext_mark_uninitialized(ex1);
++ ex2 = &newex;
++ }
++ /*
++ * for sanity, update the length of the ex2 extent before
++ * we insert ex3, if ex1 is NULL. This is to avoid temporary
++ * overlap of blocks.
++ */
++ if (!ex1 && allocated > max_blocks)
++ ex2->ee_len = cpu_to_le16(max_blocks);
++ /* ex3: to ee_block + ee_len : uninitialised */
++ if (allocated > max_blocks) {
++ unsigned int newdepth;
++ ex3 = &newex;
++ ex3->ee_block = cpu_to_le32(iblock + max_blocks);
++ ext4_ext_store_pblock(ex3, newblock + max_blocks);
++ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
++ ext4_ext_mark_uninitialized(ex3);
++ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
++ if (err == -ENOSPC) {
++ err = ext4_ext_zeroout(inode, &orig_ex);
++ if (err)
++ goto fix_extent_len;
++ /* update the extent length and mark as initialized */
++ ex->ee_block = orig_ex.ee_block;
++ ex->ee_len = orig_ex.ee_len;
++ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
++ ext4_ext_dirty(handle, inode, path + depth);
++ /* zeroed the full extent */
++ /* blocks available from iblock */
++ return allocated;
++
++ } else if (err)
++ goto fix_extent_len;
++ /*
++ * The depth, and hence eh & ex might change
++ * as part of the insert above.
++ */
++ newdepth = ext_depth(inode);
++ /*
++ * update the extent length after successful insert of the
++ * split extent
++ */
++ orig_ex.ee_len = cpu_to_le16(ee_len -
++ ext4_ext_get_actual_len(ex3));
++ depth = newdepth;
++ ext4_ext_drop_refs(path);
++ path = ext4_ext_find_extent(inode, iblock, path);
++ if (IS_ERR(path)) {
++ err = PTR_ERR(path);
++ goto out;
++ }
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++ if (ex2 != &newex)
++ ex2 = ex;
++
++ err = ext4_ext_get_access(handle, inode, path + depth);
++ if (err)
++ goto out;
++
++ allocated = max_blocks;
++ }
++ /*
++ * If there was a change of depth as part of the
++ * insertion of ex3 above, we need to update the length
++ * of the ex1 extent again here
++ */
++ if (ex1 && ex1 != ex) {
++ ex1 = ex;
++ ex1->ee_len = cpu_to_le16(iblock - ee_block);
++ ext4_ext_mark_uninitialized(ex1);
++ ex2 = &newex;
++ }
++ /*
++ * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
++ * uninitialised still.
++ */
++ ex2->ee_block = cpu_to_le32(iblock);
++ ext4_ext_store_pblock(ex2, newblock);
++ ex2->ee_len = cpu_to_le16(allocated);
++ ext4_ext_mark_uninitialized(ex2);
++ if (ex2 != ex)
++ goto insert;
++ /* Mark modified extent as dirty */
++ err = ext4_ext_dirty(handle, inode, path + depth);
++ ext_debug("out here\n");
++ goto out;
++insert:
++ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
++ if (err == -ENOSPC) {
++ err = ext4_ext_zeroout(inode, &orig_ex);
++ if (err)
++ goto fix_extent_len;
++ /* update the extent length and mark as initialized */
++ ex->ee_block = orig_ex.ee_block;
++ ex->ee_len = orig_ex.ee_len;
++ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
++ ext4_ext_dirty(handle, inode, path + depth);
++ /* zero out the first half */
++ return allocated;
++ } else if (err)
++ goto fix_extent_len;
++out:
++ ext4_ext_show_leaf(inode, path);
++ return err ? err : allocated;
++
++fix_extent_len:
++ ex->ee_block = orig_ex.ee_block;
++ ex->ee_len = orig_ex.ee_len;
++ ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
++ ext4_ext_mark_uninitialized(ex);
++ ext4_ext_dirty(handle, inode, path + depth);
++ return err;
++}
++static int ext4_convert_unwritten_extents_dio(handle_t *handle,
++ struct inode *inode,
++ struct ext4_ext_path *path)
++{
++ struct ext4_extent *ex;
++ struct ext4_extent_header *eh;
++ int depth;
++ int err = 0;
++ int ret = 0;
++
++ depth = ext_depth(inode);
++ eh = path[depth].p_hdr;
++ ex = path[depth].p_ext;
++
++ err = ext4_ext_get_access(handle, inode, path + depth);
++ if (err)
++ goto out;
++ /* first mark the extent as initialized */
++ ext4_ext_mark_initialized(ex);
++
++ /*
++ * We have to see if it can be merged with the extent
++ * on the left.
++ */
++ if (ex > EXT_FIRST_EXTENT(eh)) {
++ /*
++ * To merge left, pass "ex - 1" to try_to_merge(),
++ * since it merges towards right _only_.
++ */
++ ret = ext4_ext_try_to_merge(inode, path, ex - 1);
++ if (ret) {
++ err = ext4_ext_correct_indexes(handle, inode, path);
++ if (err)
++ goto out;
++ depth = ext_depth(inode);
++ ex--;
++ }
++ }
++ /*
++ * Try to Merge towards right.
++ */
++ ret = ext4_ext_try_to_merge(inode, path, ex);
++ if (ret) {
++ err = ext4_ext_correct_indexes(handle, inode, path);
++ if (err)
++ goto out;
++ depth = ext_depth(inode);
++ }
++ /* Mark modified extent as dirty */
++ err = ext4_ext_dirty(handle, inode, path + depth);
++out:
++ ext4_ext_show_leaf(inode, path);
++ return err;
++}
++
++static int
++ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
++ ext4_lblk_t iblock, unsigned int max_blocks,
++ struct ext4_ext_path *path, int flags,
++ unsigned int allocated, struct buffer_head *bh_result,
++ ext4_fsblk_t newblock)
++{
++ int ret = 0;
++ int err = 0;
++
++ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
++ "block %llu, max_blocks %u, flags %d, allocated %u",
++ inode->i_ino, (unsigned long long)iblock, max_blocks,
++ flags, allocated);
++ ext4_ext_show_leaf(inode, path);
++
++ /* DIO get_block() before submit the IO, split the extent */
++ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
++ ret = ext4_split_unwritten_extents(handle,
++ inode, path, iblock,
++ max_blocks, flags);
++ goto out;
++ }
++ /* DIO end_io complete, convert the filled extent to written */
++ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
++ ret = ext4_convert_unwritten_extents_dio(handle, inode,
++ path);
++ goto out2;
++ }
++ /* buffered IO case */
++ /*
++ * repeat fallocate creation request
++ * we already have an unwritten extent
++ */
++ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
++ goto map_out;
++
++ /* buffered READ or buffered write_begin() lookup */
++ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
++ /*
++ * We have blocks reserved already. We
++ * return allocated blocks so that delalloc
++ * won't do block reservation for us. But
++ * the buffer head will be unmapped so that
++ * a read from the block returns 0s.
++ */
++ set_buffer_unwritten(bh_result);
++ goto out1;
++ }
++
++ /* buffered write, writepage time, convert*/
++ ret = ext4_ext_convert_to_initialized(handle, inode,
++ path, iblock,
++ max_blocks);
++out:
++ if (ret <= 0) {
++ err = ret;
++ goto out2;
++ } else
++ allocated = ret;
++ set_buffer_new(bh_result);
++map_out:
++ set_buffer_mapped(bh_result);
++out1:
++ if (allocated > max_blocks)
++ allocated = max_blocks;
++ ext4_ext_show_leaf(inode, path);
++ bh_result->b_bdev = inode->i_sb->s_bdev;
++ bh_result->b_blocknr = newblock;
++out2:
++ if (path) {
++ ext4_ext_drop_refs(path);
++ kfree(path);
++ }
++ return err ? err : allocated;
++}
++/*
+ * Block allocation/map/preallocation routine for extents based files
+ *
+ *
+@@ -2868,33 +3184,10 @@ int ext4_ext_get_blocks(handle_t *handle
+ EXT4_EXT_CACHE_EXTENT);
+ goto out;
+ }
+- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
+- goto out;
+- if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+- if (allocated > max_blocks)
+- allocated = max_blocks;
+- /*
+- * We have blocks reserved already. We
+- * return allocated blocks so that delalloc
+- * won't do block reservation for us. But
+- * the buffer head will be unmapped so that
+- * a read from the block returns 0s.
+- */
+- set_buffer_unwritten(bh_result);
+- bh_result->b_bdev = inode->i_sb->s_bdev;
+- bh_result->b_blocknr = newblock;
+- goto out2;
+- }
+-
+- ret = ext4_ext_convert_to_initialized(handle, inode,
+- path, iblock,
+- max_blocks);
+- if (ret <= 0) {
+- err = ret;
+- goto out2;
+- } else
+- allocated = ret;
+- goto outnew;
++ ret = ext4_ext_handle_uninitialized_extents(handle,
++ inode, iblock, max_blocks, path,
++ flags, allocated, bh_result, newblock);
++ return ret;
+ }
+ }
+
+@@ -2967,7 +3260,7 @@ int ext4_ext_get_blocks(handle_t *handle
+ newex.ee_len = cpu_to_le16(ar.len);
+ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
+ ext4_ext_mark_uninitialized(&newex);
+- err = ext4_ext_insert_extent(handle, inode, path, &newex);
++ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err) {
+ /* free data blocks we just allocated */
+ /* not a good idea to call discard here directly,
+@@ -2981,7 +3274,6 @@ int ext4_ext_get_blocks(handle_t *handle
+ /* previous routine could use block we allocated */
+ newblock = ext_pblock(&newex);
+ allocated = ext4_ext_get_actual_len(&newex);
+-outnew:
+ set_buffer_new(bh_result);
+
+ /* Cache only when it is _not_ an uninitialized extent */
+@@ -3180,6 +3472,63 @@ retry:
+ }
+
+ /*
++ * This function convert a range of blocks to written extents
++ * The caller of this function will pass the start offset and the size.
++ * all unwritten extents within this range will be converted to
++ * written extents.
++ *
++ * This function is called from the direct IO end io call back
++ * function, to convert the fallocated extents after IO is completed.
++ */
++int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
++ loff_t len)
++{
++ handle_t *handle;
++ ext4_lblk_t block;
++ unsigned int max_blocks;
++ int ret = 0;
++ int ret2 = 0;
++ struct buffer_head map_bh;
++ unsigned int credits, blkbits = inode->i_blkbits;
++
++ block = offset >> blkbits;
++ /*
++ * We can't just convert len to max_blocks because
++ * If blocksize = 4096 offset = 3072 and len = 2048
++ */
++ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
++ - block;
++ /*
++ * credits to insert 1 extent into extent tree
++ */
++ credits = ext4_chunk_trans_blocks(inode, max_blocks);
++ while (ret >= 0 && ret < max_blocks) {
++ block = block + ret;
++ max_blocks = max_blocks - ret;
++ handle = ext4_journal_start(inode, credits);
++ if (IS_ERR(handle)) {
++ ret = PTR_ERR(handle);
++ break;
++ }
++ map_bh.b_state = 0;
++ ret = ext4_get_blocks(handle, inode, block,
++ max_blocks, &map_bh,
++ EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
++ if (ret <= 0) {
++ WARN_ON(ret <= 0);
++ printk(KERN_ERR "%s: ext4_ext_get_blocks "
++ "returned error inode#%lu, block=%u, "
++ "max_blocks=%u", __func__,
++ inode->i_ino, block, max_blocks);
++ }
++ ext4_mark_inode_dirty(handle, inode);
++ ret2 = ext4_journal_stop(handle);
++ if (ret <= 0 || ret2 )
++ break;
++ }
++ return ret > 0 ? ret2 : ret;
++}
++/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1234,6 +1234,9 @@ int ext4_get_blocks(handle_t *handle, st
+ clear_buffer_mapped(bh);
+ clear_buffer_unwritten(bh);
+
++ ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
++ "logical block %lu\n", inode->i_ino, flags, max_blocks,
++ (unsigned long)block);
+ /*
+ * Try to see if we can get the block without requesting a new
+ * file system block.
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle
+ goto err_out;
+ }
+ }
+- retval = ext4_ext_insert_extent(handle, inode, path, &newext);
++ retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
+ err_out:
+ if (path) {
+ ext4_ext_drop_refs(path);
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -322,7 +322,7 @@ mext_insert_across_blocks(handle_t *hand
+ goto out;
+
+ if (ext4_ext_insert_extent(handle, orig_inode,
+- orig_path, new_ext))
++ orig_path, new_ext, 0))
+ goto out;
+ }
+
+@@ -333,7 +333,7 @@ mext_insert_across_blocks(handle_t *hand
+ goto out;
+
+ if (ext4_ext_insert_extent(handle, orig_inode,
+- orig_path, end_ext))
++ orig_path, end_ext, 0))
+ goto out;
+ }
+ out:
+
+
+From linux@linux.site Thu Dec 10 20:27:56 2009
+Message-Id: <20091211042756.339680773@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:21 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [43/90] ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0043-ext4-Use-end_io-callback-to-avoid-direct-I-O-fallbac.patch
+Content-Length: 9561
+Lines: 301
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 4c0425ff68b1b87b802ffeda7b6a46ff7da7241c)
+
+Currently the DIO VFS code passes create = 0 when writing to the
+middle of a file. It does this to avoid block allocation for holes, so
+as not to expose stale data when there is a parallel buffered read
+(which does not hold the i_mutex lock). Direct I/O writes into holes
+fall back to buffered I/O for this reason.
+
+Since preallocated extents are treated as holes when doing a
+get_block() lookup (buffer is not mapped), direct I/O over fallocated
+space also falls back to buffered I/O. Thus ext4 silently falls back
+to buffered I/O in the above two cases, which is undesirable.
+
+To fix this, this patch creates uninitialized extents when a direct I/O
+write lands in a hole of a sparse file, and registers an end_io callback
+which converts the uninitialized extent to an initialized extent after
+the I/O is completed.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 3
+ fs/ext4/inode.c | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
+ fs/ext4/super.c | 11 +++
+ 3 files changed, 210 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1002,6 +1002,9 @@ struct ext4_sb_info {
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
++
++ /* workqueue for dio unwritten */
++ struct workqueue_struct *dio_unwritten_wq;
+ };
+
+ static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -37,6 +37,7 @@
+ #include <linux/namei.h>
+ #include <linux/uio.h>
+ #include <linux/bio.h>
++#include <linux/workqueue.h>
+
+ #include "ext4_jbd2.h"
+ #include "xattr.h"
+@@ -3350,6 +3351,8 @@ static int ext4_releasepage(struct page
+ }
+
+ /*
++ * O_DIRECT for ext3 (or indirect map) based files
++ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+@@ -3358,7 +3361,7 @@ static int ext4_releasepage(struct page
+ * crashes then stale disk data _may_ be exposed inside the file. But current
+ * VFS code falls back into buffered path in that case so we are safe.
+ */
+-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
++static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+ const struct iovec *iov, loff_t offset,
+ unsigned long nr_segs)
+ {
+@@ -3432,6 +3435,198 @@ out:
+ return ret;
+ }
+
++/* Maximum number of blocks we map for direct IO at once. */
++
++static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
++ struct buffer_head *bh_result, int create)
++{
++ handle_t *handle = NULL;
++ int ret = 0;
++ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
++ int dio_credits;
++
++ /*
++ * DIO VFS code passes create = 0 flag for write to
++ * the middle of file. It does this to avoid block
++ * allocation for holes, to prevent expose stale data
++ * out when there is parallel buffered read (which does
++ * not hold the i_mutex lock) while direct IO write has
++ * not completed. DIO request on holes finally falls back
++ * to buffered IO for this reason.
++ *
++ * For ext4 extent based file, since we support fallocate,
++ * new allocated extent as uninitialized, for holes, we
++ * could fallocate blocks for holes, thus parallel
++ * buffered IO read will zero out the page when read on
++ * a hole while parallel DIO write to the hole has not completed.
++ *
++ * when we come here, we know it's a direct IO write to
++ * to the middle of file (<i_size)
++ * so it's safe to override the create flag from VFS.
++ */
++ create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
++
++ if (max_blocks > DIO_MAX_BLOCKS)
++ max_blocks = DIO_MAX_BLOCKS;
++ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
++ handle = ext4_journal_start(inode, dio_credits);
++ if (IS_ERR(handle)) {
++ ret = PTR_ERR(handle);
++ goto out;
++ }
++ ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
++ create);
++ if (ret > 0) {
++ bh_result->b_size = (ret << inode->i_blkbits);
++ ret = 0;
++ }
++ ext4_journal_stop(handle);
++out:
++ return ret;
++}
++
++#define DIO_AIO 0x1
++
++static void ext4_free_io_end(ext4_io_end_t *io)
++{
++ kfree(io);
++}
++
++/*
++ * IO write completion for unwritten extents.
++ *
++ * check a range of space and convert unwritten extents to written.
++ */
++static void ext4_end_dio_unwritten(struct work_struct *work)
++{
++ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
++ struct inode *inode = io->inode;
++ loff_t offset = io->offset;
++ size_t size = io->size;
++ int ret = 0;
++ int aio = io->flag & DIO_AIO;
++
++ if (aio)
++ mutex_lock(&inode->i_mutex);
++ if (offset + size <= i_size_read(inode))
++ ret = ext4_convert_unwritten_extents(inode, offset, size);
++
++ if (ret < 0)
++ printk(KERN_EMERG "%s: failed to convert unwritten"
++ "extents to written extents, error is %d\n",
++ __func__, ret);
++
++ ext4_free_io_end(io);
++ if (aio)
++ mutex_unlock(&inode->i_mutex);
++}
++
++static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
++{
++ ext4_io_end_t *io = NULL;
++
++ io = kmalloc(sizeof(*io), GFP_NOFS);
++
++ if (io) {
++ io->inode = inode;
++ io->flag = flag;
++ io->offset = 0;
++ io->size = 0;
++ io->error = 0;
++ INIT_WORK(&io->work, ext4_end_dio_unwritten);
++ }
++
++ return io;
++}
++
++static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
++ ssize_t size, void *private)
++{
++ ext4_io_end_t *io_end = iocb->private;
++ struct workqueue_struct *wq;
++
++ /* if not hole or unwritten extents, just simple return */
++ if (!io_end || !size || !iocb->private)
++ return;
++ io_end->offset = offset;
++ io_end->size = size;
++ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
++
++ /* We need to convert unwritten extents to written */
++ queue_work(wq, &io_end->work);
++
++ if (is_sync_kiocb(iocb))
++ flush_workqueue(wq);
++
++ iocb->private = NULL;
++}
++/*
++ * For ext4 extent files, ext4 will do direct-io write to holes,
++ * preallocated extents, and those write extend the file, no need to
++ * fall back to buffered IO.
++ *
++ * For holes, we fallocate those blocks, mark them as unintialized
++ * If those blocks were preallocated, we mark sure they are splited, but
++ * still keep the range to write as unintialized.
++ *
++ * When end_io call back function called at the last IO complete time,
++ * those extents will be converted to written extents.
++ *
++ * If the O_DIRECT write will extend the file then add this inode to the
++ * orphan list. So recovery will truncate it back to the original size
++ * if the machine crashes during the write.
++ *
++ */
++static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
++ const struct iovec *iov, loff_t offset,
++ unsigned long nr_segs)
++{
++ struct file *file = iocb->ki_filp;
++ struct inode *inode = file->f_mapping->host;
++ ssize_t ret;
++ size_t count = iov_length(iov, nr_segs);
++
++ loff_t final_size = offset + count;
++ if (rw == WRITE && final_size <= inode->i_size) {
++ /*
++ * For DIO we fallocate blocks for holes, we fallocate blocks
++ * The fallocated extent for hole is marked as uninitialized
++ * to prevent paralel buffered read to expose the stale data
++ * before DIO complete the data IO.
++ * as for previously fallocated extents, ext4 get_block
++ * will just simply mark the buffer mapped but still
++ * keep the extents uninitialized.
++ *
++ * At the end of IO, the ext4 end_io callback function
++ * will convert those unwritten extents to written,
++ *
++ */
++ iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
++ if (!iocb->private)
++ return -ENOMEM;
++ ret = blockdev_direct_IO(rw, iocb, inode,
++ inode->i_sb->s_bdev, iov,
++ offset, nr_segs,
++ ext4_get_block_dio_write,
++ ext4_end_io_dio);
++ return ret;
++ }
++ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
++}
++
++static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
++ const struct iovec *iov, loff_t offset,
++ unsigned long nr_segs)
++{
++ struct file *file = iocb->ki_filp;
++ struct inode *inode = file->f_mapping->host;
++
++ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
++ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
++
++ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
++}
++
+ /*
+ * Pages can be marked dirty completely asynchronously from ext4's journalling
+ * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -580,6 +580,9 @@ static void ext4_put_super(struct super_
+ struct ext4_super_block *es = sbi->s_es;
+ int i, err;
+
++ flush_workqueue(sbi->dio_unwritten_wq);
++ destroy_workqueue(sbi->dio_unwritten_wq);
++
+ lock_super(sb);
+ lock_kernel();
+ if (sb->s_dirt)
+@@ -2809,6 +2812,12 @@ no_journal:
+ clear_opt(sbi->s_mount_opt, NOBH);
+ }
+ }
++ EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
++ if (!EXT4_SB(sb)->dio_unwritten_wq) {
++ printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
++ goto failed_mount_wq;
++ }
++
+ /*
+ * The jbd2_journal_load will have done any necessary log recovery,
+ * so we can safely mount the rest of the filesystem now.
+@@ -2921,6 +2930,8 @@ cantfind_ext4:
+
+ failed_mount4:
+ ext4_msg(sb, KERN_ERR, "mount failed");
++ destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
++failed_mount_wq:
+ ext4_release_system_zone(sb);
+ if (sbi->s_journal) {
+ jbd2_journal_destroy(sbi->s_journal);
+
+
+From linux@linux.site Thu Dec 10 20:27:57 2009
+Message-Id: <20091211042756.989318573@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:22 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Mingming Cao <cmm@us.ibm.com>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [44/90] ext4: async direct IO for holes and fallocate support
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0044-ext4-async-direct-IO-for-holes-and-fallocate-support.patch
+Content-Length: 15796
+Lines: 475
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 8d5d02e6b176565c77ff03604908b1453a22044d)
+
+For async direct IO that covers holes or fallocated space, the end_io
+callback function now queues the conversion work on a workqueue but
+doesn't flush the work right away, as that might take too long.
+
+But when fsync is called after all the data I/O has completed, the user
+expects the metadata to be updated as well before fsync returns.
+
+Thus we need to flush the conversion work when fsync() is called.
+This patch keeps track of a list of completed async direct I/Os that
+have work queued on the workqueue. When fsync() is called, it will go
+through the list and do the conversion.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 9 +-
+ fs/ext4/extents.c | 19 ++++
+ fs/ext4/fsync.c | 5 +
+ fs/ext4/inode.c | 231 +++++++++++++++++++++++++++++++++++++++++++++---------
+ fs/ext4/super.c | 8 +
+ 5 files changed, 233 insertions(+), 39 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -113,7 +113,9 @@ struct ext4_allocation_request {
+ unsigned int flags;
+ };
+
++#define DIO_AIO_UNWRITTEN 0x1
+ typedef struct ext4_io_end {
++ struct list_head list; /* per-file finished AIO list */
+ struct inode *inode; /* file being written to */
+ unsigned int flag; /* sync IO or AIO */
+ int error; /* I/O error code */
+@@ -692,6 +694,11 @@ struct ext4_inode_info {
+ __u16 i_extra_isize;
+
+ spinlock_t i_block_reservation_lock;
++
++ /* completed async DIOs that might need unwritten extents handling */
++ struct list_head i_aio_dio_complete_list;
++ /* current io_end structure for async DIO write*/
++ ext4_io_end_t *cur_aio_dio;
+ };
+
+ /*
+@@ -1424,7 +1431,7 @@ extern int ext4_block_truncate_page(hand
+ struct address_space *mapping, loff_t from);
+ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+ extern qsize_t ext4_get_reserved_space(struct inode *inode);
+-
++extern int flush_aio_dio_completed_IO(struct inode *inode);
+ /* ioctl.c */
+ extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3012,6 +3012,7 @@ ext4_ext_handle_uninitialized_extents(ha
+ {
+ int ret = 0;
+ int err = 0;
++ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+ ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
+ "block %llu, max_blocks %u, flags %d, allocated %u",
+@@ -3024,6 +3025,9 @@ ext4_ext_handle_uninitialized_extents(ha
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
++ /* flag the io_end struct that we need convert when IO done */
++ if (io)
++ io->flag = DIO_AIO_UNWRITTEN;
+ goto out;
+ }
+ /* DIO end_io complete, convert the filled extent to written */
+@@ -3109,6 +3113,7 @@ int ext4_ext_get_blocks(handle_t *handle
+ int err = 0, depth, ret, cache_type;
+ unsigned int allocated = 0;
+ struct ext4_allocation_request ar;
++ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+
+ __clear_bit(BH_New, &bh_result->b_state);
+ ext_debug("blocks %u/%u requested for inode %u\n",
+@@ -3258,8 +3263,20 @@ int ext4_ext_get_blocks(handle_t *handle
+ /* try to insert new extent into found leaf and return */
+ ext4_ext_store_pblock(&newex, newblock);
+ newex.ee_len = cpu_to_le16(ar.len);
+- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */
++ /* Mark uninitialized */
++ if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
+ ext4_ext_mark_uninitialized(&newex);
++ /*
++ * io_end structure was created for every async
++ * direct IO write to the middle of the file.
++ * To avoid unecessary convertion for every aio dio rewrite
++ * to the mid of file, here we flag the IO that is really
++ * need the convertion.
++ *
++ */
++ if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
++ io->flag = DIO_AIO_UNWRITTEN;
++ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err) {
+ /* free data blocks we just allocated */
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -44,6 +44,8 @@
+ *
+ * What we do is just kick off a commit and wait on it. This will snapshot the
+ * inode to disk.
++ *
++ * i_mutex lock is held when entering and exiting this function
+ */
+
+ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
+@@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, st
+
+ trace_ext4_sync_file(file, dentry, datasync);
+
++ ret = flush_aio_dio_completed_IO(inode);
++ if (ret < 0)
++ goto out;
+ /*
+ * data=writeback:
+ * The caller's filemap_fdatawrite()/wait will sync the data.
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3445,6 +3445,8 @@ static int ext4_get_block_dio_write(stru
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
+
++ ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n",
++ inode->i_ino, create);
+ /*
+ * DIO VFS code passes create = 0 flag for write to
+ * the middle of file. It does this to avoid block
+@@ -3485,55 +3487,152 @@ out:
+ return ret;
+ }
+
+-#define DIO_AIO 0x1
+-
+ static void ext4_free_io_end(ext4_io_end_t *io)
+ {
++ BUG_ON(!io);
++ iput(io->inode);
+ kfree(io);
+ }
++static void dump_aio_dio_list(struct inode * inode)
++{
++#ifdef EXT4_DEBUG
++ struct list_head *cur, *before, *after;
++ ext4_io_end_t *io, *io0, *io1;
++
++ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
++ ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino);
++ return;
++ }
++
++ ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino);
++ list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){
++ cur = &io->list;
++ before = cur->prev;
++ io0 = container_of(before, ext4_io_end_t, list);
++ after = cur->next;
++ io1 = container_of(after, ext4_io_end_t, list);
++
++ ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n",
++ io, inode->i_ino, io0, io1);
++ }
++#endif
++}
+
+ /*
+- * IO write completion for unwritten extents.
+- *
+ * check a range of space and convert unwritten extents to written.
+ */
+-static void ext4_end_dio_unwritten(struct work_struct *work)
++static int ext4_end_aio_dio_nolock(ext4_io_end_t *io)
+ {
+- ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+ struct inode *inode = io->inode;
+ loff_t offset = io->offset;
+ size_t size = io->size;
+ int ret = 0;
+- int aio = io->flag & DIO_AIO;
+
+- if (aio)
+- mutex_lock(&inode->i_mutex);
++ ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p,"
++ "list->prev 0x%p\n",
++ io, inode->i_ino, io->list.next, io->list.prev);
++
++ if (list_empty(&io->list))
++ return ret;
++
++ if (io->flag != DIO_AIO_UNWRITTEN)
++ return ret;
++
+ if (offset + size <= i_size_read(inode))
+ ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+- if (ret < 0)
++ if (ret < 0) {
+ printk(KERN_EMERG "%s: failed to convert unwritten"
+- "extents to written extents, error is %d\n",
+- __func__, ret);
++ "extents to written extents, error is %d"
++ " io is still on inode %lu aio dio list\n",
++ __func__, ret, inode->i_ino);
++ return ret;
++ }
++
++ /* clear the DIO AIO unwritten flag */
++ io->flag = 0;
++ return ret;
++}
++/*
++ * work on completed aio dio IO, to convert unwritten extents to extents
++ */
++static void ext4_end_aio_dio_work(struct work_struct *work)
++{
++ ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
++ struct inode *inode = io->inode;
++ int ret = 0;
+
+- ext4_free_io_end(io);
+- if (aio)
+- mutex_unlock(&inode->i_mutex);
++ mutex_lock(&inode->i_mutex);
++ ret = ext4_end_aio_dio_nolock(io);
++ if (ret >= 0) {
++ if (!list_empty(&io->list))
++ list_del_init(&io->list);
++ ext4_free_io_end(io);
++ }
++ mutex_unlock(&inode->i_mutex);
+ }
++/*
++ * This function is called from ext4_sync_file().
++ *
++ * When AIO DIO IO is completed, the work to convert unwritten
++ * extents to written is queued on workqueue but may not get immediately
++ * scheduled. When fsync is called, we need to ensure the
++ * conversion is complete before fsync returns.
++ * The inode keeps track of a list of completed AIO from DIO path
++ * that might needs to do the conversion. This function walks through
++ * the list and convert the related unwritten extents to written.
++ */
++int flush_aio_dio_completed_IO(struct inode *inode)
++{
++ ext4_io_end_t *io;
++ int ret = 0;
++ int ret2 = 0;
++
++ if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list))
++ return ret;
+
+-static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
++ dump_aio_dio_list(inode);
++ while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){
++ io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next,
++ ext4_io_end_t, list);
++ /*
++ * Calling ext4_end_aio_dio_nolock() to convert completed
++ * IO to written.
++ *
++ * When ext4_sync_file() is called, run_queue() may already
++ * about to flush the work corresponding to this io structure.
++ * It will be upset if it founds the io structure related
++ * to the work-to-be schedule is freed.
++ *
++ * Thus we need to keep the io structure still valid here after
++ * convertion finished. The io structure has a flag to
++ * avoid double converting from both fsync and background work
++ * queue work.
++ */
++ ret = ext4_end_aio_dio_nolock(io);
++ if (ret < 0)
++ ret2 = ret;
++ else
++ list_del_init(&io->list);
++ }
++ return (ret2 < 0) ? ret2 : 0;
++}
++
++static ext4_io_end_t *ext4_init_io_end (struct inode *inode)
+ {
+ ext4_io_end_t *io = NULL;
+
+ io = kmalloc(sizeof(*io), GFP_NOFS);
+
+ if (io) {
++ igrab(inode);
+ io->inode = inode;
+- io->flag = flag;
++ io->flag = 0;
+ io->offset = 0;
+ io->size = 0;
+ io->error = 0;
+- INIT_WORK(&io->work, ext4_end_dio_unwritten);
++ INIT_WORK(&io->work, ext4_end_aio_dio_work);
++ INIT_LIST_HEAD(&io->list);
+ }
+
+ return io;
+@@ -3545,19 +3644,31 @@ static void ext4_end_io_dio(struct kiocb
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
+- /* if not hole or unwritten extents, just simple return */
+- if (!io_end || !size || !iocb->private)
++ ext_debug("ext4_end_io_dio(): io_end 0x%p"
++ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
++ iocb->private, io_end->inode->i_ino, iocb, offset,
++ size);
++ /* if not async direct IO or dio with 0 bytes write, just return */
++ if (!io_end || !size)
++ return;
++
++ /* if not aio dio with unwritten extents, just free io and return */
++ if (io_end->flag != DIO_AIO_UNWRITTEN){
++ ext4_free_io_end(io_end);
++ iocb->private = NULL;
+ return;
++ }
++
+ io_end->offset = offset;
+ io_end->size = size;
+ wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+- /* We need to convert unwritten extents to written */
++ /* queue the work to convert unwritten extents to written */
+ queue_work(wq, &io_end->work);
+
+- if (is_sync_kiocb(iocb))
+- flush_workqueue(wq);
+-
++ /* Add the io_end to per-inode completed aio dio list*/
++ list_add_tail(&io_end->list,
++ &EXT4_I(io_end->inode)->i_aio_dio_complete_list);
+ iocb->private = NULL;
+ }
+ /*
+@@ -3569,8 +3680,10 @@ static void ext4_end_io_dio(struct kiocb
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+- * When end_io call back function called at the last IO complete time,
+- * those extents will be converted to written extents.
++ * The unwrritten extents will be converted to written when DIO is completed.
++ * For async direct IO, since the IO may still pending when return, we
++ * set up an end_io call back function, which will do the convertion
++ * when async direct IO completed.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list. So recovery will truncate it back to the original size
+@@ -3589,28 +3702,76 @@ static ssize_t ext4_ext_direct_IO(int rw
+ loff_t final_size = offset + count;
+ if (rw == WRITE && final_size <= inode->i_size) {
+ /*
+- * For DIO we fallocate blocks for holes, we fallocate blocks
+- * The fallocated extent for hole is marked as uninitialized
++ * We could direct write to holes and fallocate.
++ *
++ * Allocated blocks to fill the hole are marked as uninitialized
+ * to prevent paralel buffered read to expose the stale data
+ * before DIO complete the data IO.
+- * as for previously fallocated extents, ext4 get_block
++ *
++ * As to previously fallocated extents, ext4 get_block
+ * will just simply mark the buffer mapped but still
+ * keep the extents uninitialized.
+ *
+- * At the end of IO, the ext4 end_io callback function
+- * will convert those unwritten extents to written,
+- *
++ * for non AIO case, we will convert those unwritten extents
++ * to written after return back from blockdev_direct_IO.
++ *
++ * for async DIO, the conversion needs to be defered when
++ * the IO is completed. The ext4 end_io callback function
++ * will be called to take care of the conversion work.
++ * Here for async case, we allocate an io_end structure to
++ * hook to the iocb.
+ */
+- iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
+- if (!iocb->private)
+- return -ENOMEM;
++ iocb->private = NULL;
++ EXT4_I(inode)->cur_aio_dio = NULL;
++ if (!is_sync_kiocb(iocb)) {
++ iocb->private = ext4_init_io_end(inode);
++ if (!iocb->private)
++ return -ENOMEM;
++ /*
++ * we save the io structure for current async
++ * direct IO, so that later ext4_get_blocks()
++ * could flag the io structure whether there
++ * is a unwritten extents needs to be converted
++ * when IO is completed.
++ */
++ EXT4_I(inode)->cur_aio_dio = iocb->private;
++ }
++
+ ret = blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_dio_write,
+ ext4_end_io_dio);
++ if (iocb->private)
++ EXT4_I(inode)->cur_aio_dio = NULL;
++ /*
++ * The io_end structure takes a reference to the inode,
++ * that structure needs to be destroyed and the
++ * reference to the inode need to be dropped, when IO is
++ * complete, even with 0 byte write, or failed.
++ *
++ * In the successful AIO DIO case, the io_end structure will be
++ * desctroyed and the reference to the inode will be dropped
++ * after the end_io call back function is called.
++ *
++ * In the case there is 0 byte write, or error case, since
++ * VFS direct IO won't invoke the end_io call back function,
++ * we need to free the end_io structure here.
++ */
++ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
++ ext4_free_io_end(iocb->private);
++ iocb->private = NULL;
++ } else if (ret > 0)
++ /*
++ * for non AIO case, since the IO is already
++ * completed, we could do the convertion right here
++ */
++ ret = ext4_convert_unwritten_extents(inode,
++ offset, ret);
+ return ret;
+ }
++
++ /* for write the the end of file case, we fall back to old way */
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ }
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(st
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
++ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
++ ei->cur_aio_dio = NULL;
+
+ return &ei->vfs_inode;
+ }
+@@ -3383,11 +3385,13 @@ static int ext4_sync_fs(struct super_blo
+ {
+ int ret = 0;
+ tid_t target;
++ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ trace_ext4_sync_fs(sb, wait);
+- if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
++ flush_workqueue(sbi->dio_unwritten_wq);
++ if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
+ if (wait)
+- jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
++ jbd2_log_wait_commit(sbi->s_journal, target);
+ }
+ return ret;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:58 2009
+Message-Id: <20091211042757.491467570@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:23 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [45/90] ext4: EXT4_IOC_MOVE_EXT: Check for different original and donor inodes first
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0045-ext4-EXT4_IOC_MOVE_EXT-Check-for-different-original-.patch
+Content-Length: 1529
+Lines: 45
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit f3ce8064b388ccf420012c5a4907aae4f13fe9d0)
+
+Move the check to make sure the original and donor inodes are
+different earlier, to avoid a potential deadlock by trying to lock the
+same inode twice.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -1001,14 +1001,6 @@ mext_check_arguments(struct inode *orig_
+ return -EINVAL;
+ }
+
+- /* orig and donor should be different file */
+- if (orig_inode->i_ino == donor_inode->i_ino) {
+- ext4_debug("ext4 move extent: The argument files should not "
+- "be same file [ino:orig %lu, donor %lu]\n",
+- orig_inode->i_ino, donor_inode->i_ino);
+- return -EINVAL;
+- }
+-
+ /* Ext4 move extent supports only extent based file */
+ if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+ ext4_debug("ext4 move extent: orig file is not extents "
+@@ -1232,6 +1224,14 @@ ext4_move_extents(struct file *o_filp, s
+ int block_len_in_page;
+ int uninit;
+
++ /* orig and donor should be different file */
++ if (orig_inode->i_ino == donor_inode->i_ino) {
++ ext4_debug("ext4 move extent: The argument files should not "
++ "be same file [ino:orig %lu, donor %lu]\n",
++ orig_inode->i_ino, donor_inode->i_ino);
++ return -EINVAL;
++ }
++
+ /* protect orig and donor against a truncate */
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+
+
+From linux@linux.site Thu Dec 10 20:27:58 2009
+Message-Id: <20091211042758.055563900@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:24 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Frank Mayhar <fmayhar@google.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [46/90] ext4: Avoid updating the inode table bh twice in no journal mode
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0046-ext4-Avoid-updating-the-inode-table-bh-twice-in-no-j.patch
+Content-Length: 2805
+Lines: 84
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 830156c79b0a99ddf0f62496bcf4de640f9f52cd)
+
+This is a cleanup of commit 91ac6f4. Since ext4_mark_inode_dirty()
+has already called ext4_mark_iloc_dirty(), which in turn calls
+ext4_do_update_inode(), it's not necessary to have ext4_write_inode()
+call ext4_do_update_inode() in no journal mode. Indeed, it would be
+duplicated work.
+
+Reviewed-by: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Frank Mayhar <fmayhar@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 37 ++++++++++++++++---------------------
+ 1 file changed, 16 insertions(+), 21 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4981,8 +4981,7 @@ static int ext4_inode_blocks_set(handle_
+ */
+ static int ext4_do_update_inode(handle_t *handle,
+ struct inode *inode,
+- struct ext4_iloc *iloc,
+- int do_sync)
++ struct ext4_iloc *iloc)
+ {
+ struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+@@ -5083,22 +5082,10 @@ static int ext4_do_update_inode(handle_t
+ raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ }
+
+- /*
+- * If we're not using a journal and we were called from
+- * ext4_write_inode() to sync the inode (making do_sync true),
+- * we can just use sync_dirty_buffer() directly to do our dirty
+- * work. Testing s_journal here is a bit redundant but it's
+- * worth it to avoid potential future trouble.
+- */
+- if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
+- BUFFER_TRACE(bh, "call sync_dirty_buffer");
+- sync_dirty_buffer(bh);
+- } else {
+- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+- rc = ext4_handle_dirty_metadata(handle, inode, bh);
+- if (!err)
+- err = rc;
+- }
++ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
++ rc = ext4_handle_dirty_metadata(handle, inode, bh);
++ if (!err)
++ err = rc;
+ ei->i_state &= ~EXT4_STATE_NEW;
+
+ out_brelse:
+@@ -5166,8 +5153,16 @@ int ext4_write_inode(struct inode *inode
+ err = ext4_get_inode_loc(inode, &iloc);
+ if (err)
+ return err;
+- err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
+- inode, &iloc, wait);
++ if (wait)
++ sync_dirty_buffer(iloc.bh);
++ if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
++ ext4_error(inode->i_sb, __func__,
++ "IO error syncing inode, "
++ "inode=%lu, block=%llu",
++ inode->i_ino,
++ (unsigned long long)iloc.bh->b_blocknr);
++ err = -EIO;
++ }
+ }
+ return err;
+ }
+@@ -5463,7 +5458,7 @@ int ext4_mark_iloc_dirty(handle_t *handl
+ get_bh(iloc->bh);
+
+ /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
+- err = ext4_do_update_inode(handle, inode, iloc, 0);
++ err = ext4_do_update_inode(handle, inode, iloc);
+ put_bh(iloc->bh);
+ return err;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:27:59 2009
+Message-Id: <20091211042758.666491884@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:25 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Curt Wohlgemuth <curtw@google.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [47/90] ext4: Make sure ext4_dirty_inode() updates the inode in no journal mode
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0047-ext4-Make-sure-ext4_dirty_inode-updates-the-inode-in.patch
+Content-Length: 1452
+Lines: 48
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit f3dc272fd5e2ae08244796bb39e7e1ce4b25d3b3)
+
+This patch fixes a problem where ext4_dirty_inode() was not calling
+ext4_mark_inode_dirty() if the current_handle is not valid, which
+is the case in no journal mode.
+
+It also removes a test for non-matching transaction which can never
+happen.
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 19 ++++---------------
+ 1 file changed, 4 insertions(+), 15 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5605,24 +5605,13 @@ void ext4_dirty_inode(struct inode *inod
+ handle_t *current_handle = ext4_journal_current_handle();
+ handle_t *handle;
+
+- if (!ext4_handle_valid(current_handle)) {
+- ext4_mark_inode_dirty(current_handle, inode);
+- return;
+- }
+-
+ handle = ext4_journal_start(inode, 2);
+ if (IS_ERR(handle))
+ goto out;
+- if (current_handle &&
+- current_handle->h_transaction != handle->h_transaction) {
+- /* This task has a transaction open against a different fs */
+- printk(KERN_EMERG "%s: transactions do not match!\n",
+- __func__);
+- } else {
+- jbd_debug(5, "marking dirty. outer handle=%p\n",
+- current_handle);
+- ext4_mark_inode_dirty(handle, inode);
+- }
++
++ jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle);
++ ext4_mark_inode_dirty(handle, inode);
++
+ ext4_journal_stop(handle);
+ out:
+ return;
+
+
+From linux@linux.site Thu Dec 10 20:27:59 2009
+Message-Id: <20091211042759.257547227@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:26 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Curt Wohlgemuth <curtw@google.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [48/90] ext4: Handle nested ext4_journal_start/stop calls without a journal
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0048-ext4-Handle-nested-ext4_journal_start-stop-calls-wit.patch
+Content-Length: 3068
+Lines: 110
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit d3d1faf6a74496ea4435fd057c6a2cad49f3e523)
+
+This patch fixes a problem with handling nested calls to
+ext4_journal_start/ext4_journal_stop, when there is no journal present.
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4_jbd2.h | 6 ++++--
+ fs/ext4/namei.c | 3 ++-
+ fs/ext4/super.c | 42 ++++++++++++++++++++++++++++++++----------
+ 3 files changed, 38 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.h
++++ b/fs/ext4/ext4_jbd2.h
+@@ -161,11 +161,13 @@ int __ext4_handle_dirty_metadata(const c
+ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
+ int __ext4_journal_stop(const char *where, handle_t *handle);
+
+-#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1)
++#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
+
++/* Note: Do not use this for NULL handles. This is only to determine if
++ * a properly allocated handle is using a journal or not. */
+ static inline int ext4_handle_valid(handle_t *handle)
+ {
+- if (handle == EXT4_NOJOURNAL_HANDLE)
++ if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
+ return 0;
+ return 1;
+ }
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2068,7 +2068,8 @@ int ext4_orphan_del(handle_t *handle, st
+ struct ext4_iloc iloc;
+ int err = 0;
+
+- if (!ext4_handle_valid(handle))
++ /* ext4_handle_valid() assumes a valid handle_t pointer */
++ if (handle && !ext4_handle_valid(handle))
+ return 0;
+
+ mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -189,6 +189,36 @@ void ext4_itable_unused_set(struct super
+ bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
+ }
+
++
++/* Just increment the non-pointer handle value */
++static handle_t *ext4_get_nojournal(void)
++{
++ handle_t *handle = current->journal_info;
++ unsigned long ref_cnt = (unsigned long)handle;
++
++ BUG_ON(ref_cnt >= EXT4_NOJOURNAL_MAX_REF_COUNT);
++
++ ref_cnt++;
++ handle = (handle_t *)ref_cnt;
++
++ current->journal_info = handle;
++ return handle;
++}
++
++
++/* Decrement the non-pointer handle value */
++static void ext4_put_nojournal(handle_t *handle)
++{
++ unsigned long ref_cnt = (unsigned long)handle;
++
++ BUG_ON(ref_cnt == 0);
++
++ ref_cnt--;
++ handle = (handle_t *)ref_cnt;
++
++ current->journal_info = handle;
++}
++
+ /*
+ * Wrappers for jbd2_journal_start/end.
+ *
+@@ -215,11 +245,7 @@ handle_t *ext4_journal_start_sb(struct s
+ }
+ return jbd2_journal_start(journal, nblocks);
+ }
+- /*
+- * We're not journaling, return the appropriate indication.
+- */
+- current->journal_info = EXT4_NOJOURNAL_HANDLE;
+- return current->journal_info;
++ return ext4_get_nojournal();
+ }
+
+ /*
+@@ -235,11 +261,7 @@ int __ext4_journal_stop(const char *wher
+ int rc;
+
+ if (!ext4_handle_valid(handle)) {
+- /*
+- * Do this here since we don't call jbd2_journal_stop() in
+- * no-journal mode.
+- */
+- current->journal_info = NULL;
++ ext4_put_nojournal(handle);
+ return 0;
+ }
+ sb = handle->h_transaction->t_journal->j_private;
+
+
+From linux@linux.site Thu Dec 10 20:28:00 2009
+Message-Id: <20091211042759.784398525@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:27 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [49/90] ext4: Fix time encoding with extra epoch bits
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0049-ext4-Fix-time-encoding-with-extra-epoch-bits.patch
+Content-Length: 1431
+Lines: 37
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit c1fccc0696bcaff6008c11865091f5ec4b0937ab)
+
+"Looking at ext4.h, I think the setting of extra time fields forgets to
+mask the epoch bits so the epoch part overwrites nsec part. The second
+change is only for coherency (2 -> EXT4_EPOCH_BITS)."
+
+Thanks to Damien Guibouret for pointing out this problem.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -522,8 +522,8 @@ struct move_extent {
+ static inline __le32 ext4_encode_extra_time(struct timespec *time)
+ {
+ return cpu_to_le32((sizeof(time->tv_sec) > 4 ?
+- time->tv_sec >> 32 : 0) |
+- ((time->tv_nsec << 2) & EXT4_NSEC_MASK));
++ (time->tv_sec >> 32) & EXT4_EPOCH_MASK : 0) |
++ ((time->tv_nsec << EXT4_EPOCH_BITS) & EXT4_NSEC_MASK));
+ }
+
+ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
+@@ -531,7 +531,7 @@ static inline void ext4_decode_extra_tim
+ if (sizeof(time->tv_sec) > 4)
+ time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK)
+ << 32;
+- time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2;
++ time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
+ }
+
+ #define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \
+
+
+From linux@linux.site Thu Dec 10 20:28:00 2009
+Message-Id: <20091211042800.341947927@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:28 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [50/90] ext4: fix a BUG_ON crash by checking that page has buffers attached to it
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0050-ext4-fix-a-BUG_ON-crash-by-checking-that-page-has-bu.patch
+Content-Length: 1530
+Lines: 53
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 1f94533d9cd75f6d2826018d54a971b9cc085992)
+
+In ext4_num_dirty_pages() we were calling page_buffers() before
+checking to see if the page actually had buffers attached to it; this
+would cause a BUG check crash in the inline function page_buffers().
+
+Thanks to Markus Trippelsdorf for reporting this bug.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1147,8 +1147,8 @@ static int check_block_validity(struct i
+ }
+
+ /*
+- * Return the number of dirty pages in the given inode starting at
+- * page frame idx.
++ * Return the number of contiguous dirty pages in a given inode
++ * starting at page frame idx.
+ */
+ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
+ unsigned int max_pages)
+@@ -1182,15 +1182,15 @@ static pgoff_t ext4_num_dirty_pages(stru
+ unlock_page(page);
+ break;
+ }
+- head = page_buffers(page);
+- bh = head;
+- do {
+- if (!buffer_delay(bh) &&
+- !buffer_unwritten(bh)) {
+- done = 1;
+- break;
+- }
+- } while ((bh = bh->b_this_page) != head);
++ if (page_has_buffers(page)) {
++ bh = head = page_buffers(page);
++ do {
++ if (!buffer_delay(bh) &&
++ !buffer_unwritten(bh))
++ done = 1;
++ bh = bh->b_this_page;
++ } while (!done && (bh != head));
++ }
+ unlock_page(page);
+ if (done)
+ break;
+
+
+From linux@linux.site Thu Dec 10 20:28:01 2009
+Message-Id: <20091211042800.901979632@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:29 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [51/90] ext4: retry failed direct IO allocations
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0051-ext4-retry-failed-direct-IO-allocations.patch
+Content-Length: 1203
+Lines: 44
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit fbbf69456619de5d251cb9f1df609069178c62d5)
+
+On a 256M filesystem, doing this in a loop:
+
+ xfs_io -F -f -d -c 'pwrite 0 64m' test
+ rm -f test
+
+eventually leads to ENOSPC. (the xfs_io command does a
+64m direct IO write to the file "test")
+
+As with other block allocation callers, it looks like we need to
+potentially retry the allocations on the initial ENOSPC.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3372,6 +3372,7 @@ static ssize_t ext4_ind_direct_IO(int rw
+ ssize_t ret;
+ int orphan = 0;
+ size_t count = iov_length(iov, nr_segs);
++ int retries = 0;
+
+ if (rw == WRITE) {
+ loff_t final_size = offset + count;
+@@ -3394,9 +3395,12 @@ static ssize_t ext4_ind_direct_IO(int rw
+ }
+ }
+
++retry:
+ ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block, NULL);
++ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
++ goto retry;
+
+ if (orphan) {
+ int err;
+
+
+From linux@linux.site Thu Dec 10 20:28:01 2009
+Message-Id: <20091211042801.408749880@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:30 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [52/90] ext4: discard preallocation when restarting a transaction during truncate
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0052-ext4-discard-preallocation-when-restarting-a-transac.patch
+Content-Length: 1276
+Lines: 35
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit fa5d11133b07053270e18fa9c18560e66e79217e)
+
+When restarting a transaction during a truncate operation, we drop and
+reacquire i_data_sem. After reacquiring i_data_sem, we need to
+discard any inode-based preallocation that might have been grabbed
+while we released i_data_sem (for example, if pdflush is allocating
+blocks and racing against the truncate).
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -193,7 +193,7 @@ static int try_to_extend_transaction(han
+ * so before we call here everything must be consistently dirtied against
+ * this transaction.
+ */
+- int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
++int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
+ int nblocks)
+ {
+ int ret;
+@@ -209,6 +209,7 @@ static int try_to_extend_transaction(han
+ up_write(&EXT4_I(inode)->i_data_sem);
+ ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ down_write(&EXT4_I(inode)->i_data_sem);
++ ext4_discard_preallocations(inode);
+
+ return ret;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:28:02 2009
+Message-Id: <20091211042802.010530397@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:31 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [53/90] ext4: fix ext4_ext_direct_IO()s return value after converting uninit extents
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0053-ext4-fix-ext4_ext_direct_IO-s-return-value-after-con.patch
+Content-Length: 1858
+Lines: 55
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 109f55651954def97fa41ee71c464d268c512ab0)
+
+After a direct I/O request covering an uninitialized extent (i.e.,
+created using the fallocate system call) or a hole in a file, ext4
+will convert the uninitialized extent so it is marked as initialized
+by calling ext4_convert_unwritten_extents(). This function returns
+zero on success.
+
+This return value was getting returned by ext4_direct_IO(); however
+the file system's direct_IO function is supposed to return the number
+of bytes read or written on a success. By returning zero, it confused
+the direct I/O code into falling back to buffered I/O unnecessarily.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 1 +
+ fs/ext4/inode.c | 10 +++++++---
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3496,6 +3496,7 @@ retry:
+ *
+ * This function is called from the direct IO end io call back
+ * function, to convert the fallocated extents after IO is completed.
++ * Returns 0 on success.
+ */
+ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
+ loff_t len)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3766,13 +3766,17 @@ static ssize_t ext4_ext_direct_IO(int rw
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+- } else if (ret > 0)
++ } else if (ret > 0) {
++ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the convertion right here
+ */
+- ret = ext4_convert_unwritten_extents(inode,
+- offset, ret);
++ err = ext4_convert_unwritten_extents(inode,
++ offset, ret);
++ if (err < 0)
++ ret = err;
++ }
+ return ret;
+ }
+
+
+
+From linux@linux.site Thu Dec 10 20:28:03 2009
+Message-Id: <20091211042802.562626729@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:32 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [54/90] ext4: skip conversion of uninit extents after direct IO if there isnt any
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0054-ext4-skip-conversion-of-uninit-extents-after-direct-.patch
+Content-Length: 3314
+Lines: 92
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 5f5249507e4b5c4fc0f9c93f33d133d8c95f47e1)
+
+At the end of direct I/O operation, ext4_ext_direct_IO() always called
+ext4_convert_unwritten_extents(), regardless of whether there were any
+unwritten extents involved in the I/O or not.
+
+This commit adds a state flag so that ext4_ext_direct_IO() only calls
+ext4_convert_unwritten_extents() when necessary.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 1 +
+ fs/ext4/extents.c | 22 +++++++++++++++++-----
+ fs/ext4/inode.c | 4 +++-
+ 3 files changed, 21 insertions(+), 6 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -318,6 +318,7 @@ static inline __u32 ext4_mask_flags(umod
+ #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
+ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+ #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
++#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
+
+ /* Used to pass group descriptor data when online resize is done */
+ struct ext4_new_group_input {
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3025,12 +3025,18 @@ ext4_ext_handle_uninitialized_extents(ha
+ ret = ext4_split_unwritten_extents(handle,
+ inode, path, iblock,
+ max_blocks, flags);
+- /* flag the io_end struct that we need convert when IO done */
++ /*
++ * Flag the inode(non aio case) or end_io struct (aio case)
++ * that this IO needs to convertion to written when IO is
++ * completed
++ */
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
++ else
++ EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
+ goto out;
+ }
+- /* DIO end_io complete, convert the filled extent to written */
++ /* async DIO end_io complete, convert the filled extent to written */
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
+@@ -3272,10 +3278,16 @@ int ext4_ext_get_blocks(handle_t *handle
+ * To avoid unecessary convertion for every aio dio rewrite
+ * to the mid of file, here we flag the IO that is really
+ * need the convertion.
+- *
++ * For non asycn direct IO case, flag the inode state
++ * that we need to perform convertion when IO is done.
+ */
+- if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT)
+- io->flag = DIO_AIO_UNWRITTEN;
++ if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
++ if (io)
++ io->flag = DIO_AIO_UNWRITTEN;
++ else
++ EXT4_I(inode)->i_state |=
++ EXT4_STATE_DIO_UNWRITTEN;;
++ }
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err) {
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3766,7 +3766,8 @@ static ssize_t ext4_ext_direct_IO(int rw
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+- } else if (ret > 0) {
++ } else if (ret > 0 && (EXT4_I(inode)->i_state &
++ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+@@ -3776,6 +3777,7 @@ static ssize_t ext4_ext_direct_IO(int rw
+ offset, ret);
+ if (err < 0)
+ ret = err;
++ EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
+ }
+ return ret;
+ }
+
+
+From linux@linux.site Thu Dec 10 20:28:03 2009
+Message-Id: <20091211042803.182022052@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:33 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [55/90] ext4: code clean up for dio fallocate handling
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0055-ext4-code-clean-up-for-dio-fallocate-handling.patch
+Content-Length: 1564
+Lines: 48
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 4b70df181611012a3556f017b57dfcef7e1d279f)
+
+The ext4_debug() call in ext4_end_io_dio() should be moved after the
+check to make sure that io_end is non-NULL.
+
+The comment above ext4_get_block_dio_write() ("Maximum number of
+blocks...") is a duplicate; the original and correct comment is above
+the #define DIO_MAX_BLOCKS up above.
+
+Based on review comments from Curt Wohlgemuth.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3440,8 +3440,6 @@ out:
+ return ret;
+ }
+
+-/* Maximum number of blocks we map for direct IO at once. */
+-
+ static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+ {
+@@ -3649,13 +3647,14 @@ static void ext4_end_io_dio(struct kiocb
+ ext4_io_end_t *io_end = iocb->private;
+ struct workqueue_struct *wq;
+
++ /* if not async direct IO or dio with 0 bytes write, just return */
++ if (!io_end || !size)
++ return;
++
+ ext_debug("ext4_end_io_dio(): io_end 0x%p"
+ "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+ iocb->private, io_end->inode->i_ino, iocb, offset,
+ size);
+- /* if not async direct IO or dio with 0 bytes write, just return */
+- if (!io_end || !size)
+- return;
+
+ /* if not aio dio with unwritten extents, just free io and return */
+ if (io_end->flag != DIO_AIO_UNWRITTEN){
+
+
+From linux@linux.site Thu Dec 10 20:28:04 2009
+Message-Id: <20091211042803.809439347@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:34 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [56/90] ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0056-ext4-Fix-return-value-of-ext4_split_unwritten_extent.patch
+Content-Length: 2272
+Lines: 58
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit ba230c3f6dc88ec008806adb27b12088486d508e)
+
+To prepare for a direct I/O write, we need to split the unwritten
+extents before submitting the I/O. When no extents needed to be
+split, ext4_split_unwritten_extents() was incorrectly returning 0
+instead of the size of uninitialized extents. This bug caused the
+wrong return value sent back to VFS code when it gets called from
+async IO path, leading to an unnecessary fall back to buffered IO.
+
+This bug also hid the fact that the check to see whether or not a
+split would be necessary was incorrect; we can only skip splitting the
+extent if the write completely covers the uninitialized extent.
+
+Signed-off-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2784,6 +2784,8 @@ fix_extent_len:
+ * into three uninitialized extent(at most). After IO complete, the part
+ * being filled will be convert to initialized by the end_io callback function
+ * via ext4_convert_unwritten_extents().
++ *
++ * Returns the size of uninitialized extent to be written on success.
+ */
+ static int ext4_split_unwritten_extents(handle_t *handle,
+ struct inode *inode,
+@@ -2801,7 +2803,6 @@ static int ext4_split_unwritten_extents(
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+- int ret = 0;
+
+ ext_debug("ext4_split_unwritten_extents: inode %lu,"
+ "iblock %llu, max_blocks %u\n", inode->i_ino,
+@@ -2819,12 +2820,12 @@ static int ext4_split_unwritten_extents(
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
+- * if the entire unintialized extent length less than
+- * the size of extent to write, there is no need to split
+- * uninitialized extent
++ * If the uninitialized extent begins at the same logical
++ * block where the write begins, and the write completely
++ * covers the extent, then we don't need to split it.
+ */
+- if (allocated <= max_blocks)
+- return ret;
++ if ((iblock == ee_block) && (allocated <= max_blocks))
++ return allocated;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+
+
+From linux@linux.site Thu Dec 10 20:28:04 2009
+Message-Id: <20091211042804.362034552@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:35 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Curt Wohlgemuth <curtw@google.com>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [57/90] ext4: fix potential buffer head leak when add_dirent_to_buf() returns ENOSPC
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0057-ext4-fix-potential-buffer-head-leak-when-add_dirent_.patch
+Content-Length: 3833
+Lines: 118
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 2de770a406b06dfc619faabbf5d85c835ed3f2e1)
+
+Previously add_dirent_to_buf() did not free its passed-in buffer head
+in the case of ENOSPC, since in some cases the caller still needed it.
+However, this led to potential buffer head leaks since not all callers
+dealt with this correctly. Fix this by simplifying the freeing
+convention; now add_dirent_to_buf() *never* frees the passed-in buffer
+head, and leaves that to the responsibility of its caller. This makes
+things cleaner and makes it easier to prove that the code is neither
+leaking buffer heads nor calling brelse() one time too many.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Cc: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/namei.c | 30 ++++++++++++------------------
+ 1 file changed, 12 insertions(+), 18 deletions(-)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1292,9 +1292,6 @@ errout:
+ * add_dirent_to_buf will attempt search the directory block for
+ * space. It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+- *
+- * NOTE! bh is NOT released in the case where ENOSPC is returned. In
+- * all other cases bh is released.
+ */
+ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
+ struct inode *inode, struct ext4_dir_entry_2 *de,
+@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *h
+ top = bh->b_data + blocksize - reclen;
+ while ((char *) de <= top) {
+ if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
+- bh, offset)) {
+- brelse(bh);
++ bh, offset))
+ return -EIO;
+- }
+- if (ext4_match(namelen, name, de)) {
+- brelse(bh);
++ if (ext4_match(namelen, name, de))
+ return -EEXIST;
+- }
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ if ((de->inode? rlen - nlen: rlen) >= reclen)
+@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *h
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err) {
+ ext4_std_error(dir->i_sb, err);
+- brelse(bh);
+ return err;
+ }
+
+@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *h
+ err = ext4_handle_dirty_metadata(handle, dir, bh);
+ if (err)
+ ext4_std_error(dir->i_sb, err);
+- brelse(bh);
+ return 0;
+ }
+
+@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *ha
+ if (!(de))
+ return retval;
+
+- return add_dirent_to_buf(handle, dentry, inode, de, bh);
++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ brelse(bh);
++ return retval;
+ }
+
+ /*
+@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *hand
+ if(!bh)
+ return retval;
+ retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+- if (retval != -ENOSPC)
++ if (retval != -ENOSPC) {
++ brelse(bh);
+ return retval;
++ }
+
+ if (blocks == 1 && !dx_fallback &&
+ EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
+@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *hand
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+- return add_dirent_to_buf(handle, dentry, inode, de, bh);
++ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ brelse(bh);
++ return retval;
+ }
+
+ /*
+@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *h
+ goto journal_error;
+
+ err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
+- if (err != -ENOSPC) {
+- bh = NULL;
++ if (err != -ENOSPC)
+ goto cleanup;
+- }
+
+ /* Block full, should compress but for now just split */
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *h
+ if (!de)
+ goto cleanup;
+ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
+- bh = NULL;
+ goto cleanup;
+
+ journal_error:
+
+
+From linux@linux.site Thu Dec 10 20:28:05 2009
+Message-Id: <20091211042804.949254622@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:36 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [58/90] ext4: avoid divide by zero when trying to mount a corrupted file system
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0058-ext4-avoid-divide-by-zero-when-trying-to-mount-a-cor.patch
+Content-Length: 1267
+Lines: 39
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 503358ae01b70ce6909d19dd01287093f6b6271c)
+
+If s_log_groups_per_flex is greater than 31, then groups_per_flex will
+overflow and cause a divide by zero error. This can cause a kernel
+BUG if such a file system is mounted.
+
+Thanks to Nageswara R Sastry for analyzing the failure and providing
+an initial patch.
+
+http://bugzilla.kernel.org/show_bug.cgi?id=14287
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1695,14 +1695,14 @@ static int ext4_fill_flex_info(struct su
+ size_t size;
+ int i;
+
+- if (!sbi->s_es->s_log_groups_per_flex) {
++ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
++ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
++
++ if (groups_per_flex < 2) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+- sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+- groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+-
+ /* We allocate both existing and potentially added groups */
+ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+ ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
+
+
+From linux@linux.site Thu Dec 10 20:28:05 2009
+Message-Id: <20091211042805.444446227@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:37 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [59/90] ext4: fix the returned block count if EXT4_IOC_MOVE_EXT fails
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0059-ext4-fix-the-returned-block-count-if-EXT4_IOC_MOVE_E.patch
+Content-Length: 10970
+Lines: 349
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit f868a48d06f8886cb0367568a12367fa4f21ea0d)
+
+If the EXT4_IOC_MOVE_EXT ioctl fails, the number of blocks that were
+exchanged before the failure should be returned to the userspace
+caller. Unfortunately, currently if the block size is not the same as
+the page size, the block count that is returned is the
+page-aligned block count instead of the actual block count. This
+commit addresses this bug.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 139 ++++++++++++++++++++++++++------------------------
+ 1 file changed, 73 insertions(+), 66 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -661,6 +661,7 @@ mext_calc_swap_extents(struct ext4_exten
+ * @donor_inode: donor inode
+ * @from: block offset of orig_inode
+ * @count: block count to be replaced
++ * @err: pointer to save return value
+ *
+ * Replace original inode extents and donor inode extents page by page.
+ * We implement this replacement in the following three steps:
+@@ -671,19 +672,18 @@ mext_calc_swap_extents(struct ext4_exten
+ * 3. Change the block information of donor inode to point at the saved
+ * original inode blocks in the dummy extents.
+ *
+- * Return 0 on success, or a negative error value on failure.
++ * Return replaced block count.
+ */
+ static int
+ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
+ struct inode *donor_inode, ext4_lblk_t from,
+- ext4_lblk_t count)
++ ext4_lblk_t count, int *err)
+ {
+ struct ext4_ext_path *orig_path = NULL;
+ struct ext4_ext_path *donor_path = NULL;
+ struct ext4_extent *oext, *dext;
+ struct ext4_extent tmp_dext, tmp_oext;
+ ext4_lblk_t orig_off = from, donor_off = from;
+- int err = 0;
+ int depth;
+ int replaced_count = 0;
+ int dext_alen;
+@@ -691,13 +691,13 @@ mext_replace_branches(handle_t *handle,
+ mext_double_down_write(orig_inode, donor_inode);
+
+ /* Get the original extent for the block "orig_off" */
+- err = get_ext_path(orig_inode, orig_off, &orig_path);
+- if (err)
++ *err = get_ext_path(orig_inode, orig_off, &orig_path);
++ if (*err)
+ goto out;
+
+ /* Get the donor extent for the head */
+- err = get_ext_path(donor_inode, donor_off, &donor_path);
+- if (err)
++ *err = get_ext_path(donor_inode, donor_off, &donor_path);
++ if (*err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+@@ -707,9 +707,9 @@ mext_replace_branches(handle_t *handle,
+ dext = donor_path[depth].p_ext;
+ tmp_dext = *dext;
+
+- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
++ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count);
+- if (err)
++ if (*err)
+ goto out;
+
+ /* Loop for the donor extents */
+@@ -718,7 +718,7 @@ mext_replace_branches(handle_t *handle,
+ if (!dext) {
+ ext4_error(donor_inode->i_sb, __func__,
+ "The extent for donor must be found");
+- err = -EIO;
++ *err = -EIO;
+ goto out;
+ } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
+ ext4_error(donor_inode->i_sb, __func__,
+@@ -726,20 +726,20 @@ mext_replace_branches(handle_t *handle,
+ "extent(%u) should be equal",
+ donor_off,
+ le32_to_cpu(tmp_dext.ee_block));
+- err = -EIO;
++ *err = -EIO;
+ goto out;
+ }
+
+ /* Set donor extent to orig extent */
+- err = mext_leaf_block(handle, orig_inode,
++ *err = mext_leaf_block(handle, orig_inode,
+ orig_path, &tmp_dext, &orig_off);
+- if (err < 0)
++ if (*err)
+ goto out;
+
+ /* Set orig extent to donor extent */
+- err = mext_leaf_block(handle, donor_inode,
++ *err = mext_leaf_block(handle, donor_inode,
+ donor_path, &tmp_oext, &donor_off);
+- if (err < 0)
++ if (*err)
+ goto out;
+
+ dext_alen = ext4_ext_get_actual_len(&tmp_dext);
+@@ -753,35 +753,25 @@ mext_replace_branches(handle_t *handle,
+
+ if (orig_path)
+ ext4_ext_drop_refs(orig_path);
+- err = get_ext_path(orig_inode, orig_off, &orig_path);
+- if (err)
++ *err = get_ext_path(orig_inode, orig_off, &orig_path);
++ if (*err)
+ goto out;
+ depth = ext_depth(orig_inode);
+ oext = orig_path[depth].p_ext;
+- if (le32_to_cpu(oext->ee_block) +
+- ext4_ext_get_actual_len(oext) <= orig_off) {
+- err = 0;
+- goto out;
+- }
+ tmp_oext = *oext;
+
+ if (donor_path)
+ ext4_ext_drop_refs(donor_path);
+- err = get_ext_path(donor_inode, donor_off, &donor_path);
+- if (err)
++ *err = get_ext_path(donor_inode, donor_off, &donor_path);
++ if (*err)
+ goto out;
+ depth = ext_depth(donor_inode);
+ dext = donor_path[depth].p_ext;
+- if (le32_to_cpu(dext->ee_block) +
+- ext4_ext_get_actual_len(dext) <= donor_off) {
+- err = 0;
+- goto out;
+- }
+ tmp_dext = *dext;
+
+- err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
++ *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
+ donor_off, count - replaced_count);
+- if (err)
++ if (*err)
+ goto out;
+ }
+
+@@ -796,7 +786,7 @@ out:
+ }
+
+ mext_double_up_write(orig_inode, donor_inode);
+- return err;
++ return replaced_count;
+ }
+
+ /**
+@@ -808,16 +798,17 @@ out:
+ * @data_offset_in_page: block index where data swapping starts
+ * @block_len_in_page: the number of blocks to be swapped
+ * @uninit: orig extent is uninitialized or not
++ * @err: pointer to save return value
+ *
+ * Save the data in original inode blocks and replace original inode extents
+ * with donor inode extents by calling mext_replace_branches().
+- * Finally, write out the saved data in new original inode blocks. Return 0
+- * on success, or a negative error value on failure.
++ * Finally, write out the saved data in new original inode blocks. Return
++ * replaced block count.
+ */
+ static int
+ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ pgoff_t orig_page_offset, int data_offset_in_page,
+- int block_len_in_page, int uninit)
++ int block_len_in_page, int uninit, int *err)
+ {
+ struct inode *orig_inode = o_filp->f_dentry->d_inode;
+ struct address_space *mapping = orig_inode->i_mapping;
+@@ -829,9 +820,11 @@ move_extent_per_page(struct file *o_filp
+ long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
+ unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+ unsigned int w_flags = 0;
+- unsigned int tmp_data_len, data_len;
++ unsigned int tmp_data_size, data_size, replaced_size;
+ void *fsdata;
+- int ret, i, jblocks;
++ int i, jblocks;
++ int err2 = 0;
++ int replaced_count = 0;
+ int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
+
+ /*
+@@ -841,8 +834,8 @@ move_extent_per_page(struct file *o_filp
+ jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ handle = ext4_journal_start(orig_inode, jblocks);
+ if (IS_ERR(handle)) {
+- ret = PTR_ERR(handle);
+- return ret;
++ *err = PTR_ERR(handle);
++ return 0;
+ }
+
+ if (segment_eq(get_fs(), KERNEL_DS))
+@@ -858,9 +851,9 @@ move_extent_per_page(struct file *o_filp
+ * Just swap data blocks between orig and donor.
+ */
+ if (uninit) {
+- ret = mext_replace_branches(handle, orig_inode,
+- donor_inode, orig_blk_offset,
+- block_len_in_page);
++ replaced_count = mext_replace_branches(handle, orig_inode,
++ donor_inode, orig_blk_offset,
++ block_len_in_page, err);
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+@@ -870,27 +863,28 @@ move_extent_per_page(struct file *o_filp
+
+ offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
+
+- /* Calculate data_len */
++ /* Calculate data_size */
+ if ((orig_blk_offset + block_len_in_page - 1) ==
+ ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
+ /* Replace the last block */
+- tmp_data_len = orig_inode->i_size & (blocksize - 1);
++ tmp_data_size = orig_inode->i_size & (blocksize - 1);
+ /*
+- * If data_len equal zero, it shows data_len is multiples of
++ * If data_size equal zero, it shows data_size is multiples of
+ * blocksize. So we set appropriate value.
+ */
+- if (tmp_data_len == 0)
+- tmp_data_len = blocksize;
++ if (tmp_data_size == 0)
++ tmp_data_size = blocksize;
+
+- data_len = tmp_data_len +
++ data_size = tmp_data_size +
+ ((block_len_in_page - 1) << orig_inode->i_blkbits);
+- } else {
+- data_len = block_len_in_page << orig_inode->i_blkbits;
+- }
++ } else
++ data_size = block_len_in_page << orig_inode->i_blkbits;
++
++ replaced_size = data_size;
+
+- ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags,
++ *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
+ &page, &fsdata);
+- if (unlikely(ret < 0))
++ if (unlikely(*err < 0))
+ goto out;
+
+ if (!PageUptodate(page)) {
+@@ -911,10 +905,17 @@ move_extent_per_page(struct file *o_filp
+ /* Release old bh and drop refs */
+ try_to_release_page(page, 0);
+
+- ret = mext_replace_branches(handle, orig_inode, donor_inode,
+- orig_blk_offset, block_len_in_page);
+- if (ret < 0)
+- goto out;
++ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
++ orig_blk_offset, block_len_in_page,
++ &err2);
++ if (err2) {
++ if (replaced_count) {
++ block_len_in_page = replaced_count;
++ replaced_size =
++ block_len_in_page << orig_inode->i_blkbits;
++ } else
++ goto out;
++ }
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+@@ -928,16 +929,16 @@ move_extent_per_page(struct file *o_filp
+ bh = bh->b_this_page;
+
+ for (i = 0; i < block_len_in_page; i++) {
+- ret = ext4_get_block(orig_inode,
++ *err = ext4_get_block(orig_inode,
+ (sector_t)(orig_blk_offset + i), bh, 0);
+- if (ret < 0)
++ if (*err < 0)
+ goto out;
+
+ if (bh->b_this_page != NULL)
+ bh = bh->b_this_page;
+ }
+
+- ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len,
++ *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
+ page, fsdata);
+ page = NULL;
+
+@@ -951,7 +952,10 @@ out:
+ out2:
+ ext4_journal_stop(handle);
+
+- return ret < 0 ? ret : 0;
++ if (err2)
++ *err = err2;
++
++ return replaced_count;
+ }
+
+ /**
+@@ -1367,15 +1371,17 @@ ext4_move_extents(struct file *o_filp, s
+ while (orig_page_offset <= seq_end_page) {
+
+ /* Swap original branches with new branches */
+- ret1 = move_extent_per_page(o_filp, donor_inode,
++ block_len_in_page = move_extent_per_page(
++ o_filp, donor_inode,
+ orig_page_offset,
+ data_offset_in_page,
+- block_len_in_page, uninit);
+- if (ret1 < 0)
+- goto out;
+- orig_page_offset++;
++ block_len_in_page, uninit,
++ &ret1);
++
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
++ if (ret1 < 0)
++ goto out;
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+@@ -1385,6 +1391,7 @@ ext4_move_extents(struct file *o_filp, s
+ goto out;
+ }
+
++ orig_page_offset++;
+ data_offset_in_page = 0;
+ rest_blocks -= block_len_in_page;
+ if (rest_blocks > blocks_per_page)
+
+
+From linux@linux.site Thu Dec 10 20:28:06 2009
+Message-Id: <20091211042805.985279951@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:38 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [60/90] ext4: fix lock order problem in ext4_move_extents()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0060-ext4-fix-lock-order-problem-in-ext4_move_extents.patch
+Content-Length: 10372
+Lines: 310
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit fc04cb49a898c372a22b21fffc47f299d8710801)
+
+ext4_move_extents() checks the logical block contiguousness of the
+original file with ext4_ext_find_extent() and mext_next_extent().
+Therefore the extent which the ext4_ext_path structure indicates
+must not be changed between the above functions.
+
+But in the current implementation, there is no i_data_sem protection
+between ext4_ext_find_extent() and mext_next_extent(). So the extent
+which the ext4_ext_path structure indicates may be overwritten by
+delalloc. As a result, ext4_move_extents() will exchange wrong blocks
+between original and donor files. I changed the places where
+i_data_sem is acquired and released to solve this problem.
+
+Moreover, I changed move_extent_per_page() to start transaction first,
+and then acquire i_data_sem. Without this change, there is a
+possibility of the deadlock between mmap() and ext4_move_extents():
+
+* NOTE: "A", "B" and "C" mean different processes
+
+A-1: ext4_ext_move_extents() acquires i_data_sem of two inodes.
+
+B: do_page_fault() starts the transaction (T),
+ and then tries to acquire i_data_sem.
+ But process "A" is already holding it, so it is kept waiting.
+
+C: While "A" and "B" running, kjournald2 tries to commit transaction (T)
+ but it is under updating, so kjournald2 waits for it.
+
+A-2: Calls ext4_journal_start() while holding i_data_sem,
+ but transaction (T) is locked.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 117 ++++++++++++++++++++++----------------------------
+ 1 file changed, 53 insertions(+), 64 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -77,12 +77,14 @@ static int
+ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent)
+ {
++ struct ext4_extent_header *eh;
+ int ppos, leaf_ppos = path->p_depth;
+
+ ppos = leaf_ppos;
+ if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
+ /* leaf block */
+ *extent = ++path[ppos].p_ext;
++ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
+ return 0;
+ }
+
+@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, st
+ ext_block_hdr(path[cur_ppos+1].p_bh);
+ }
+
++ path[leaf_ppos].p_ext = *extent = NULL;
++
++ eh = path[leaf_ppos].p_hdr;
++ if (le16_to_cpu(eh->eh_entries) == 0)
++ /* empty leaf is found */
++ return -ENODATA;
++
+ /* leaf block */
+ path[leaf_ppos].p_ext = *extent =
+ EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
++ path[leaf_ppos].p_block =
++ ext_pblock(path[leaf_ppos].p_ext);
+ return 0;
+ }
+ }
+@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inod
+ }
+
+ /**
+- * mext_double_down_read - Acquire two inodes' read semaphore
+- *
+- * @orig_inode: original inode structure
+- * @donor_inode: donor inode structure
+- * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
+- */
+-static void
+-mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
+-{
+- struct inode *first = orig_inode, *second = donor_inode;
+-
+- /*
+- * Use the inode number to provide the stable locking order instead
+- * of its address, because the C language doesn't guarantee you can
+- * compare pointers that don't come from the same array.
+- */
+- if (donor_inode->i_ino < orig_inode->i_ino) {
+- first = donor_inode;
+- second = orig_inode;
+- }
+-
+- down_read(&EXT4_I(first)->i_data_sem);
+- down_read(&EXT4_I(second)->i_data_sem);
+-}
+-
+-/**
+- * mext_double_down_write - Acquire two inodes' write semaphore
++ * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
+ *
+ * @orig_inode: original inode structure
+ * @donor_inode: donor inode structure
+- * Acquire write semaphore of the two inodes (orig and donor) by i_ino order.
++ * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
++ * i_ino order.
+ */
+ static void
+-mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
++double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+ {
+ struct inode *first = orig_inode, *second = donor_inode;
+
+@@ -207,28 +193,14 @@ mext_double_down_write(struct inode *ori
+ }
+
+ /**
+- * mext_double_up_read - Release two inodes' read semaphore
++ * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
+ *
+ * @orig_inode: original inode structure to be released its lock first
+ * @donor_inode: donor inode structure to be released its lock second
+- * Release read semaphore of two inodes (orig and donor).
++ * Release write lock of i_data_sem of two inodes (orig and donor).
+ */
+ static void
+-mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
+-{
+- up_read(&EXT4_I(orig_inode)->i_data_sem);
+- up_read(&EXT4_I(donor_inode)->i_data_sem);
+-}
+-
+-/**
+- * mext_double_up_write - Release two inodes' write semaphore
+- *
+- * @orig_inode: original inode structure to be released its lock first
+- * @donor_inode: donor inode structure to be released its lock second
+- * Release write semaphore of two inodes (orig and donor).
+- */
+-static void
+-mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
++double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
+ {
+ up_write(&EXT4_I(orig_inode)->i_data_sem);
+ up_write(&EXT4_I(donor_inode)->i_data_sem);
+@@ -688,8 +660,6 @@ mext_replace_branches(handle_t *handle,
+ int replaced_count = 0;
+ int dext_alen;
+
+- mext_double_down_write(orig_inode, donor_inode);
+-
+ /* Get the original extent for the block "orig_off" */
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
+@@ -785,7 +755,6 @@ out:
+ kfree(donor_path);
+ }
+
+- mext_double_up_write(orig_inode, donor_inode);
+ return replaced_count;
+ }
+
+@@ -851,6 +820,11 @@ move_extent_per_page(struct file *o_filp
+ * Just swap data blocks between orig and donor.
+ */
+ if (uninit) {
++ /*
++ * Protect extent trees against block allocations
++ * via delalloc
++ */
++ double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
+@@ -858,6 +832,7 @@ move_extent_per_page(struct file *o_filp
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
++ double_up_write_data_sem(orig_inode, donor_inode);
+ goto out2;
+ }
+
+@@ -905,6 +880,8 @@ move_extent_per_page(struct file *o_filp
+ /* Release old bh and drop refs */
+ try_to_release_page(page, 0);
+
++ /* Protect extent trees against block allocations via delalloc */
++ double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page,
+ &err2);
+@@ -913,14 +890,18 @@ move_extent_per_page(struct file *o_filp
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+- } else
++ } else {
++ double_up_write_data_sem(orig_inode, donor_inode);
+ goto out;
++ }
+ }
+
+ /* Clear the inode cache not to refer to the old data */
+ ext4_ext_invalidate_cache(orig_inode);
+ ext4_ext_invalidate_cache(donor_inode);
+
++ double_up_write_data_sem(orig_inode, donor_inode);
++
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+@@ -1236,16 +1217,16 @@ ext4_move_extents(struct file *o_filp, s
+ return -EINVAL;
+ }
+
+- /* protect orig and donor against a truncate */
++ /* Protect orig and donor inodes against a truncate */
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
+ if (ret1 < 0)
+ return ret1;
+
+- mext_double_down_read(orig_inode, donor_inode);
++ /* Protect extent tree against block allocations via delalloc */
++ double_down_write_data_sem(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+ donor_start, &len, *moved_len);
+- mext_double_up_read(orig_inode, donor_inode);
+ if (ret1)
+ goto out;
+
+@@ -1308,6 +1289,10 @@ ext4_move_extents(struct file *o_filp, s
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
++ /* Discard preallocations of two inodes */
++ ext4_discard_preallocations(orig_inode);
++ ext4_discard_preallocations(donor_inode);
++
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+@@ -1359,14 +1344,14 @@ ext4_move_extents(struct file *o_filp, s
+ seq_start = le32_to_cpu(ext_cur->ee_block);
+ rest_blocks = seq_blocks;
+
+- /* Discard preallocations of two inodes */
+- down_write(&EXT4_I(orig_inode)->i_data_sem);
+- ext4_discard_preallocations(orig_inode);
+- up_write(&EXT4_I(orig_inode)->i_data_sem);
+-
+- down_write(&EXT4_I(donor_inode)->i_data_sem);
+- ext4_discard_preallocations(donor_inode);
+- up_write(&EXT4_I(donor_inode)->i_data_sem);
++ /*
++ * Up semaphore to avoid following problems:
++ * a. transaction deadlock among ext4_journal_start,
++ * ->write_begin via pagefault, and jbd2_journal_commit
++ * b. racing with ->readpage, ->write_begin, and ext4_get_block
++ * in move_extent_per_page
++ */
++ double_up_write_data_sem(orig_inode, donor_inode);
+
+ while (orig_page_offset <= seq_end_page) {
+
+@@ -1381,14 +1366,14 @@ ext4_move_extents(struct file *o_filp, s
+ /* Count how many blocks we have exchanged */
+ *moved_len += block_len_in_page;
+ if (ret1 < 0)
+- goto out;
++ break;
+ if (*moved_len > len) {
+ ext4_error(orig_inode->i_sb, __func__,
+ "We replaced blocks too much! "
+ "sum of replaced: %llu requested: %llu",
+ *moved_len, len);
+ ret1 = -EIO;
+- goto out;
++ break;
+ }
+
+ orig_page_offset++;
+@@ -1400,6 +1385,10 @@ ext4_move_extents(struct file *o_filp, s
+ block_len_in_page = rest_blocks;
+ }
+
++ double_down_write_data_sem(orig_inode, donor_inode);
++ if (ret1 < 0)
++ break;
++
+ /* Decrease buffer counter */
+ if (holecheck_path)
+ ext4_ext_drop_refs(holecheck_path);
+@@ -1429,7 +1418,7 @@ out:
+ ext4_ext_drop_refs(holecheck_path);
+ kfree(holecheck_path);
+ }
+-
++ double_up_write_data_sem(orig_inode, donor_inode);
+ ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
+
+ if (ret1)
+
+
+From linux@linux.site Thu Dec 10 20:28:07 2009
+Message-Id: <20091211042806.581977969@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:39 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [61/90] ext4: fix possible recursive locking warning in EXT4_IOC_MOVE_EXT
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0061-ext4-fix-possible-recursive-locking-warning-in-EXT4_.patch
+Content-Length: 1075
+Lines: 32
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 49bd22bc4d603a2a4fc2a6a60e156cbea52eb494)
+
+If CONFIG_PROVE_LOCKING is enabled, the double_down_write_data_sem()
+will trigger a false-positive warning of a recursive lock. Since we
+take i_data_sem for the two inodes ordered by their inode numbers,
+this isn't a problem. Use of down_write_nested() will notify the lock
+dependency checker machinery that there is no problem here.
+
+This problem was reported by Brian Rogers:
+
+ http://marc.info/?l=linux-ext4&m=125115356928011&w=1
+
+Reported-by: Brian Rogers <brian@xyzw.org>
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -189,7 +189,7 @@ double_down_write_data_sem(struct inode
+ }
+
+ down_write(&EXT4_I(first)->i_data_sem);
+- down_write(&EXT4_I(second)->i_data_sem);
++ down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
+ }
+
+ /**
+
+
+From linux@linux.site Thu Dec 10 20:28:07 2009
+Message-Id: <20091211042807.176333510@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:40 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [62/90] ext4: plug a buffer_head leak in an error path of ext4_iget()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0062-ext4-plug-a-buffer_head-leak-in-an-error-path-of-ext.patch
+Content-Length: 2427
+Lines: 82
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 567f3e9a70d71e5c9be03701b8578be77857293b)
+
+One of the invalid error paths in ext4_iget() forgot to brelse() the
+inode buffer head. Fix it by adding a brelse() in the common error
+return path, which also simplifies the function.
+
+Thanks to Andi Kleen <ak@linux.intel.com> for reporting the problem.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 11 +++--------
+ 1 file changed, 3 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4771,7 +4771,6 @@ struct inode *ext4_iget(struct super_blo
+ struct ext4_iloc iloc;
+ struct ext4_inode *raw_inode;
+ struct ext4_inode_info *ei;
+- struct buffer_head *bh;
+ struct inode *inode;
+ long ret;
+ int block;
+@@ -4783,11 +4782,11 @@ struct inode *ext4_iget(struct super_blo
+ return inode;
+
+ ei = EXT4_I(inode);
++ iloc.bh = 0;
+
+ ret = __ext4_get_inode_loc(inode, &iloc, 0);
+ if (ret < 0)
+ goto bad_inode;
+- bh = iloc.bh;
+ raw_inode = ext4_raw_inode(&iloc);
+ inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+ inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+@@ -4810,7 +4809,6 @@ struct inode *ext4_iget(struct super_blo
+ if (inode->i_mode == 0 ||
+ !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
+ /* this inode is deleted */
+- brelse(bh);
+ ret = -ESTALE;
+ goto bad_inode;
+ }
+@@ -4842,7 +4840,6 @@ struct inode *ext4_iget(struct super_blo
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+ EXT4_INODE_SIZE(inode->i_sb)) {
+- brelse(bh);
+ ret = -EIO;
+ goto bad_inode;
+ }
+@@ -4895,10 +4892,8 @@ struct inode *ext4_iget(struct super_blo
+ /* Validate block references which are part of inode */
+ ret = ext4_check_inode_blockref(inode);
+ }
+- if (ret) {
+- brelse(bh);
++ if (ret)
+ goto bad_inode;
+- }
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext4_file_inode_operations;
+@@ -4926,7 +4921,6 @@ struct inode *ext4_iget(struct super_blo
+ init_special_inode(inode, inode->i_mode,
+ new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else {
+- brelse(bh);
+ ret = -EIO;
+ ext4_error(inode->i_sb, __func__,
+ "bogus i_mode (%o) for inode=%lu",
+@@ -4939,6 +4933,7 @@ struct inode *ext4_iget(struct super_blo
+ return inode;
+
+ bad_inode:
++ brelse(iloc.bh);
+ iget_failed(inode);
+ return ERR_PTR(ret);
+ }
+
+
+From linux@linux.site Thu Dec 10 20:28:08 2009
+Message-Id: <20091211042807.711256423@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:41 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [63/90] ext4: make sure directory and symlink blocks are revoked
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0063-ext4-make-sure-directory-and-symlink-blocks-are-revo.patch
+Content-Length: 2052
+Lines: 58
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 50689696867d95b38d9c7be640a311494a04fb86)
+
+When an inode gets unlinked, the functions ext4_clear_blocks() and
+ext4_remove_blocks() call ext4_forget() for all the buffer heads
+corresponding to the deleted inode's data blocks. If the inode is a
+directory or a symlink, the is_metadata parameter must be non-zero so
+ext4_forget() will revoke them via jbd2_journal_revoke(). Otherwise,
+if these blocks are reused for a data file, and the system crashes
+before a journal checkpoint, the journal replay could end up
+corrupting these data blocks.
+
+Thanks to Curt Wohlgemuth for pointing out potential problems in this
+area.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 2 +-
+ fs/ext4/inode.c | 6 ++++--
+ 2 files changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2055,7 +2055,7 @@ static int ext4_remove_blocks(handle_t *
+ ext_debug("free last %u blocks starting %llu\n", num, start);
+ for (i = 0; i < num; i++) {
+ bh = sb_find_get_block(inode->i_sb, start + i);
+- ext4_forget(handle, 0, inode, bh, start + i);
++ ext4_forget(handle, metadata, inode, bh, start + i);
+ }
+ ext4_free_blocks(handle, inode, start, num, metadata);
+ } else if (from == le32_to_cpu(ex->ee_block)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4110,6 +4110,8 @@ static void ext4_clear_blocks(handle_t *
+ __le32 *last)
+ {
+ __le32 *p;
++ int is_metadata = S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode);
++
+ if (try_to_extend_transaction(handle, inode)) {
+ if (bh) {
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+@@ -4140,11 +4142,11 @@ static void ext4_clear_blocks(handle_t *
+
+ *p = 0;
+ tbh = sb_find_get_block(inode->i_sb, nr);
+- ext4_forget(handle, 0, inode, tbh, nr);
++ ext4_forget(handle, is_metadata, inode, tbh, nr);
+ }
+ }
+
+- ext4_free_blocks(handle, inode, block_to_free, count, 0);
++ ext4_free_blocks(handle, inode, block_to_free, count, is_metadata);
+ }
+
+ /**
+
+
+From linux@linux.site Thu Dec 10 20:28:08 2009
+Message-Id: <20091211042808.337529149@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:42 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Julia Lawall <julia@diku.dk>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [64/90] ext4: fix i_flags access in ext4_da_writepages_trans_blocks()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0064-ext4-fix-i_flags-access-in-ext4_da_writepages_trans_.patch
+Content-Length: 846
+Lines: 25
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 30c6e07a92ea4cb87160d32ffa9bce172576ae4c)
+
+We need to be testing the i_flags field in the ext4 specific portion
+of the inode, instead of the (confusingly aliased) i_flags field in
+the generic struct inode.
+
+Signed-off-by: Julia Lawall <julia@diku.dk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2785,7 +2785,7 @@ static int ext4_da_writepages_trans_bloc
+ * number of contiguous block. So we will limit
+ * number of contiguous block to a sane value
+ */
+- if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
++ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+ (max_blocks > EXT4_MAX_TRANS_DATA))
+ max_blocks = EXT4_MAX_TRANS_DATA;
+
+
+
+From linux@linux.site Thu Dec 10 20:28:09 2009
+Message-Id: <20091211042808.870915761@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:43 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [65/90] ext4: journal all modifications in ext4_xattr_set_handle
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0065-ext4-journal-all-modifications-in-ext4_xattr_set_han.patch
+Content-Length: 1254
+Lines: 39
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 86ebfd08a1930ccedb8eac0aeb1ed4b8b6a41dbc)
+
+ext4_xattr_set_handle() was zeroing out an inode outside
+of journaling constraints; this is one of the accesses that
+was causing the crc errors in journal replay as seen in
+kernel.org bugzilla #14354.
+
+Reviewed-by: Andreas Dilger <adilger@sun.com>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/xattr.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -988,6 +988,10 @@ ext4_xattr_set_handle(handle_t *handle,
+ if (error)
+ goto cleanup;
+
++ error = ext4_journal_get_write_access(handle, is.iloc.bh);
++ if (error)
++ goto cleanup;
++
+ if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
+ struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+ memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+@@ -1013,9 +1017,6 @@ ext4_xattr_set_handle(handle_t *handle,
+ if (flags & XATTR_CREATE)
+ goto cleanup;
+ }
+- error = ext4_journal_get_write_access(handle, is.iloc.bh);
+- if (error)
+- goto cleanup;
+ if (!value) {
+ if (!is.s.not_found)
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
+
+
+From linux@linux.site Thu Dec 10 20:28:09 2009
+Message-Id: <20091211042809.446063479@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:44 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [66/90] ext4: dont update the superblock in ext4_statfs()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0066-ext4-don-t-update-the-superblock-in-ext4_statfs.patch
+Content-Length: 1341
+Lines: 31
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 3f8fb9490efbd300887470a2a880a64e04dcc3f5)
+
+commit a71ce8c6c9bf269b192f352ea555217815cf027e updated ext4_statfs()
+to update the on-disk superblock counters, but modified this buffer
+directly without any journaling of the change. This is one of the
+accesses that was causing the crc errors in journal replay as seen in
+kernel.org bugzilla #14354.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3693,13 +3693,11 @@ static int ext4_statfs(struct dentry *de
+ buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
+- ext4_free_blocks_count_set(es, buf->f_bfree);
+ buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
+ if (buf->f_bfree < ext4_r_blocks_count(es))
+ buf->f_bavail = 0;
+ buf->f_files = le32_to_cpu(es->s_inodes_count);
+ buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
+- es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
+ buf->f_namelen = EXT4_NAME_LEN;
+ fsid = le64_to_cpup((void *)es->s_uuid) ^
+ le64_to_cpup((void *)es->s_uuid + sizeof(u64));
+
+
+From linux@linux.site Thu Dec 10 20:28:10 2009
+Message-Id: <20091211042810.021726276@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:45 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [67/90] ext4: fix uninit block bitmap initialization when s_meta_first_bg is non-zero
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0067-ext4-fix-uninit-block-bitmap-initialization-when-s_m.patch
+Content-Length: 875
+Lines: 29
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 8dadb198cb70ef811916668fe67eeec82e8858dd)
+
+The number of old-style block group descriptor blocks is
+s_first_meta_bg when the meta_bg feature flag is set.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/balloc.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/balloc.c
++++ b/fs/ext4/balloc.c
+@@ -761,7 +761,13 @@ static unsigned long ext4_bg_num_gdb_met
+ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
+ ext4_group_t group)
+ {
+- return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0;
++ if (!ext4_bg_has_super(sb, group))
++ return 0;
++
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
++ return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
++ else
++ return EXT4_SB(sb)->s_gdb_count;
+ }
+
+ /**
+
+
+From linux@linux.site Thu Dec 10 20:28:11 2009
+Message-Id: <20091211042810.591847517@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:46 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [68/90] ext4: fix block validity checks so they work correctly with meta_bg
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0068-ext4-fix-block-validity-checks-so-they-work-correctl.patch
+Content-Length: 1411
+Lines: 39
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 1032988c71f3f85483b2b4319684d1205a704c02)
+
+The block validity checks used by ext4_data_block_valid() weren't
+correctly written to check file systems with the meta_bg feature. Fix
+this.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/block_validity.c | 2 +-
+ fs/ext4/inode.c | 5 +----
+ 2 files changed, 2 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/block_validity.c
++++ b/fs/ext4/block_validity.c
+@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_
+ if (ext4_bg_has_super(sb, i) &&
+ ((i < 5) || ((i % flex_size) == 0)))
+ add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+- sbi->s_gdb_count + 1);
++ ext4_bg_num_gdb(sb, i) + 1);
+ gdp = ext4_get_group_desc(sb, i, NULL);
+ ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+ if (ret)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4873,10 +4873,7 @@ struct inode *ext4_iget(struct super_blo
+
+ ret = 0;
+ if (ei->i_file_acl &&
+- ((ei->i_file_acl <
+- (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
+- EXT4_SB(sb)->s_gdb_count)) ||
+- (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
++ !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
+ ext4_error(sb, __func__,
+ "bad extended attribute block %llu in inode #%lu",
+ ei->i_file_acl, inode->i_ino);
+
+
+From linux@linux.site Thu Dec 10 20:28:11 2009
+Message-Id: <20091211042811.145411136@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:47 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Jan Kara <jack@suse.cz>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [69/90] ext4: avoid issuing unnecessary barriers
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0069-ext4-avoid-issuing-unnecessary-barriers.patch
+Content-Length: 1115
+Lines: 37
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 6b17d902fdd241adfa4ce780df20547b28bf5801)
+
+We don't need to issue an I/O barrier on an error or if we force commit
+because we are doing data journaling.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Cc: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/fsync.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -60,7 +60,7 @@ int ext4_sync_file(struct file *file, st
+
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+- goto out;
++ return ret;
+ /*
+ * data=writeback:
+ * The caller's filemap_fdatawrite()/wait will sync the data.
+@@ -79,10 +79,8 @@ int ext4_sync_file(struct file *file, st
+ * (they were dirtied by commit). But that's OK - the blocks are
+ * safe in-journal, which is all fsync() needs to ensure.
+ */
+- if (ext4_should_journal_data(inode)) {
+- ret = ext4_force_commit(inode->i_sb);
+- goto out;
+- }
++ if (ext4_should_journal_data(inode))
++ return ext4_force_commit(inode->i_sb);
+
+ if (!journal)
+ ret = sync_mapping_buffers(inode->i_mapping);
+
+
+From linux@linux.site Thu Dec 10 20:28:12 2009
+Message-Id: <20091211042811.707301090@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:48 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Jan Kara <jack@suse.cz>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [70/90] ext4: fix error handling in ext4_ind_get_blocks()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0070-ext4-fix-error-handling-in-ext4_ind_get_blocks.patch
+Content-Length: 733
+Lines: 25
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 2bba702d4f88d7b010ec37e2527b552588404ae7)
+
+When an error happened in ext4_splice_branch we failed to notice that
+in ext4_ind_get_blocks and mapped the buffer anyway. Fix the problem
+by checking for error properly.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1022,7 +1022,7 @@ static int ext4_ind_get_blocks(handle_t
+ if (!err)
+ err = ext4_splice_branch(handle, inode, iblock,
+ partial, indirect_blks, count);
+- else
++ if (err)
+ goto cleanup;
+
+ set_buffer_new(bh_result);
+
+
+From linux@linux.site Thu Dec 10 20:28:12 2009
+Message-Id: <20091211042812.322370572@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:49 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [71/90] ext4: make trim/discard optional (and off by default)
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0071-ext4-make-trim-discard-optional-and-off-by-default.patch
+Content-Length: 4275
+Lines: 124
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 5328e635315734d42080de9a5a1ee87bf4cae0a4)
+
+It is anticipated that when sb_issue_discard starts doing
+real work on trim-capable devices, we may see issues. Make
+this mount-time optional, and default it to off until we know
+that things are working out OK.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ Documentation/filesystems/ext4.txt | 6 ++++++
+ fs/ext4/ext4.h | 1 +
+ fs/ext4/mballoc.c | 21 +++++++++++++--------
+ fs/ext4/super.c | 14 +++++++++++++-
+ 4 files changed, 33 insertions(+), 9 deletions(-)
+
+--- a/Documentation/filesystems/ext4.txt
++++ b/Documentation/filesystems/ext4.txt
+@@ -338,6 +338,12 @@ noauto_da_alloc replacing existing file
+ system crashes before the delayed allocation
+ blocks are forced to disk.
+
++discard Controls whether ext4 should issue discard/TRIM
++nodiscard(*) commands to the underlying block device when
++ blocks are freed. This is useful for SSD devices
++ and sparse/thinly-provisioned LUNs, but it is off
++ by default until sufficient testing has been done.
++
+ Data Mode
+ =========
+ There are 3 different data modes:
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -747,6 +747,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
+ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
++#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
+
+ #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
+ #define set_opt(o, opt) o |= EXT4_MOUNT_##opt
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2810,7 +2810,6 @@ static void release_blocks_on_commit(jou
+ struct ext4_group_info *db;
+ int err, count = 0, count2 = 0;
+ struct ext4_free_data *entry;
+- ext4_fsblk_t discard_block;
+ struct list_head *l, *ltmp;
+
+ list_for_each_safe(l, ltmp, &txn->t_private_list) {
+@@ -2840,13 +2839,19 @@ static void release_blocks_on_commit(jou
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->group);
+- discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+- + entry->start_blk
+- + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+- trace_ext4_discard_blocks(sb, (unsigned long long)discard_block,
+- entry->count);
+- sb_issue_discard(sb, discard_block, entry->count);
+-
++ if (test_opt(sb, DISCARD)) {
++ ext4_fsblk_t discard_block;
++ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
++
++ discard_block = (ext4_fsblk_t)entry->group *
++ EXT4_BLOCKS_PER_GROUP(sb)
++ + entry->start_blk
++ + le32_to_cpu(es->s_first_data_block);
++ trace_ext4_discard_blocks(sb,
++ (unsigned long long)discard_block,
++ entry->count);
++ sb_issue_discard(sb, discard_block, entry->count);
++ }
+ kmem_cache_free(ext4_free_ext_cachep, entry);
+ ext4_mb_release_desc(&e4b);
+ }
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -906,6 +906,9 @@ static int ext4_show_options(struct seq_
+ if (test_opt(sb, NO_AUTO_DA_ALLOC))
+ seq_puts(seq, ",noauto_da_alloc");
+
++ if (test_opt(sb, DISCARD))
++ seq_puts(seq, ",discard");
++
+ ext4_show_quota_options(seq, sb);
+
+ return 0;
+@@ -1086,7 +1089,8 @@ enum {
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_block_validity, Opt_noblock_validity,
+- Opt_inode_readahead_blks, Opt_journal_ioprio
++ Opt_inode_readahead_blks, Opt_journal_ioprio,
++ Opt_discard, Opt_nodiscard,
+ };
+
+ static const match_table_t tokens = {
+@@ -1152,6 +1156,8 @@ static const match_table_t tokens = {
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
++ {Opt_discard, "discard"},
++ {Opt_nodiscard, "nodiscard"},
+ {Opt_err, NULL},
+ };
+
+@@ -1580,6 +1586,12 @@ set_qf_format:
+ else
+ set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+ break;
++ case Opt_discard:
++ set_opt(sbi->s_mount_opt, DISCARD);
++ break;
++ case Opt_nodiscard:
++ clear_opt(sbi->s_mount_opt, DISCARD);
++ break;
+ default:
+ ext4_msg(sb, KERN_ERR,
+ "Unrecognized mount option \"%s\" "
+
+
+From linux@linux.site Thu Dec 10 20:28:13 2009
+Message-Id: <20091211042812.915030291@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:50 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Eric Sandeen <sandeen@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [72/90] ext4: make "norecovery" an alias for "noload"
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0072-ext4-make-norecovery-an-alias-for-noload.patch
+Content-Length: 1856
+Lines: 53
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit e3bb52ae2bb9573e84c17b8e3560378d13a5c798)
+
+Users on the linux-ext4 list recently complained about differences
+across filesystems w.r.t. how to mount without a journal replay.
+
+In the discussion it was noted that xfs's "norecovery" option is
+perhaps more descriptively accurate than "noload," so let's make
+that an alias for ext4.
+
+Also show this status in /proc/mounts
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ Documentation/filesystems/ext4.txt | 4 ++--
+ fs/ext4/super.c | 4 ++++
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/Documentation/filesystems/ext4.txt
++++ b/Documentation/filesystems/ext4.txt
+@@ -153,8 +153,8 @@ journal_dev=devnum When the external jou
+ identified through its new major/minor numbers encoded
+ in devnum.
+
+-noload Don't load the journal on mounting. Note that
+- if the filesystem was not unmounted cleanly,
++norecovery Don't load the journal on mounting. Note that
++noload if the filesystem was not unmounted cleanly,
+ skipping the journal replay will lead to the
+ filesystem containing inconsistencies that can
+ lead to any number of problems.
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -909,6 +909,9 @@ static int ext4_show_options(struct seq_
+ if (test_opt(sb, DISCARD))
+ seq_puts(seq, ",discard");
+
++ if (test_opt(sb, NOLOAD))
++ seq_puts(seq, ",norecovery");
++
+ ext4_show_quota_options(seq, sb);
+
+ return 0;
+@@ -1115,6 +1118,7 @@ static const match_table_t tokens = {
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_noload, "noload"},
++ {Opt_noload, "norecovery"},
+ {Opt_nobh, "nobh"},
+ {Opt_bh, "bh"},
+ {Opt_commit, "commit=%u"},
+
+
+From linux@linux.site Thu Dec 10 20:28:13 2009
+Message-Id: <20091211042813.423360988@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:51 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [73/90] ext4: Fix double-free of blocks with EXT4_IOC_MOVE_EXT
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0073-ext4-Fix-double-free-of-blocks-with-EXT4_IOC_MOVE_EX.patch
+Content-Length: 2565
+Lines: 75
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 94d7c16cbbbd0e03841fcf272bcaf0620ad39618)
+
+At the beginning of ext4_move_extents(), we call
+ext4_discard_preallocations() to discard inode PAs of orig and donor
+inodes. But in the following case, blocks can be double freed, so
+move ext4_discard_preallocations() to the end of ext4_move_extents().
+
+1. Discard inode PAs of orig and donor inodes with
+ ext4_discard_preallocations() in ext4_move_extents().
+
+ orig : [ DATA1 ]
+ donor: [ DATA2 ]
+
+2. While data blocks are being exchanged between the orig and donor
+ inodes, a new inode PA is created for orig by another process's block
+ allocation (since there are semaphore gaps in ext4_move_extents()),
+ and the new inode PA is partially used (2-1).
+
+ 2-1 Create new inode PAs to orig inode
+ orig : [ DATA1 | used PA1 | free PA1 ]
+ donor: [ DATA2 ]
+
+3. The donor inode, which now holds the old orig inode's blocks, is
+ deleted after EXT4_IOC_MOVE_EXT finishes (3-1, 3-2), so the block
+ bitmap bits corresponding to the old orig inode's blocks are freed.
+
+ 3-1 After EXT4_IOC_MOVE_EXT finished
+ orig : [ DATA2 | free PA1 ]
+ donor: [ DATA1 | used PA1 ]
+
+ 3-2 Delete donor inode
+ orig : [ DATA2 | free PA1 ]
+ donor: [ FREE SPACE(DATA1) | FREE SPACE(used PA1) ]
+
+4. A double free of blocks occurs when close() is called on the orig
+ inode, because ext4_discard_preallocations() for the orig inode
+ frees used PA1 and free PA1, though used PA1 was already freed in 3.
+
+ 4-1 Double-free of blocks is occurred
+ orig : [ DATA2 | FREE SPACE(free PA1) ]
+ donor: [ FREE SPACE(DATA1) | DOUBLE FREE(used PA1) ]
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -1289,10 +1289,6 @@ ext4_move_extents(struct file *o_filp, s
+ ext4_ext_get_actual_len(ext_cur), block_end + 1) -
+ max(le32_to_cpu(ext_cur->ee_block), block_start);
+
+- /* Discard preallocations of two inodes */
+- ext4_discard_preallocations(orig_inode);
+- ext4_discard_preallocations(donor_inode);
+-
+ while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
+ seq_blocks += add_blocks;
+
+@@ -1410,6 +1406,11 @@ ext4_move_extents(struct file *o_filp, s
+
+ }
+ out:
++ if (*moved_len) {
++ ext4_discard_preallocations(orig_inode);
++ ext4_discard_preallocations(donor_inode);
++ }
++
+ if (orig_path) {
+ ext4_ext_drop_refs(orig_path);
+ kfree(orig_path);
+
+
+From linux@linux.site Thu Dec 10 20:28:14 2009
+Message-Id: <20091211042814.022299856@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:52 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Kazuya Mio <k-mio@sx.jp.nec.com>,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [74/90] ext4: initialize moved_len before calling ext4_move_extents()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0074-ext4-initialize-moved_len-before-calling-ext4_move_e.patch
+Content-Length: 2445
+Lines: 72
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 446aaa6e7e993b38a6f21c6acfa68f3f1af3dbe3)
+
+The move_extent.moved_len field is used to pass back the number of
+exchanged blocks to user space. Currently the caller must clear this
+field; but we spend more code space checking for this requirement than
+simply zeroing the field ourselves, so let's just make life easier for
+everyone all around.
+
+Signed-off-by: Kazuya Mio <k-mio@sx.jp.nec.com>
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c | 1 +
+ fs/ext4/move_extent.c | 14 +++-----------
+ 2 files changed, 4 insertions(+), 11 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -239,6 +239,7 @@ setversion_out:
+ }
+ }
+
++ me.moved_len = 0;
+ err = ext4_move_extents(filp, donor_filp, me.orig_start,
+ me.donor_start, me.len, &me.moved_len);
+ fput(donor_filp);
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -947,7 +947,6 @@ out2:
+ * @orig_start: logical start offset in block for orig
+ * @donor_start: logical start offset in block for donor
+ * @len: the number of blocks to be moved
+- * @moved_len: moved block length
+ *
+ * Check the arguments of ext4_move_extents() whether the files can be
+ * exchanged with each other.
+@@ -955,8 +954,8 @@ out2:
+ */
+ static int
+ mext_check_arguments(struct inode *orig_inode,
+- struct inode *donor_inode, __u64 orig_start,
+- __u64 donor_start, __u64 *len, __u64 moved_len)
++ struct inode *donor_inode, __u64 orig_start,
++ __u64 donor_start, __u64 *len)
+ {
+ ext4_lblk_t orig_blocks, donor_blocks;
+ unsigned int blkbits = orig_inode->i_blkbits;
+@@ -1010,13 +1009,6 @@ mext_check_arguments(struct inode *orig_
+ return -EINVAL;
+ }
+
+- if (moved_len) {
+- ext4_debug("ext4 move extent: moved_len should be 0 "
+- "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
+- donor_inode->i_ino);
+- return -EINVAL;
+- }
+-
+ if ((orig_start > MAX_DEFRAG_SIZE) ||
+ (donor_start > MAX_DEFRAG_SIZE) ||
+ (*len > MAX_DEFRAG_SIZE) ||
+@@ -1226,7 +1218,7 @@ ext4_move_extents(struct file *o_filp, s
+ double_down_write_data_sem(orig_inode, donor_inode);
+ /* Check the filesystem environment whether move_extent can be done */
+ ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
+- donor_start, &len, *moved_len);
++ donor_start, &len);
+ if (ret1)
+ goto out;
+
+
+
+From linux@linux.site Thu Dec 10 20:28:15 2009
+Message-Id: <20091211042814.628012070@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:53 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [75/90] ext4: move_extent_per_page() cleanup
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0075-ext4-move_extent_per_page-cleanup.patch
+Content-Length: 2733
+Lines: 87
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit ac48b0a1d068887141581bea8285de5fcab182b0)
+
+Integrate duplicate lines (acquire/release semaphore and invalidate
+extent cache in move_extent_per_page()) into mext_replace_branches(),
+to reduce source and object code size.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 30 +++++++++---------------------
+ 1 file changed, 9 insertions(+), 21 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -660,6 +660,9 @@ mext_replace_branches(handle_t *handle,
+ int replaced_count = 0;
+ int dext_alen;
+
++ /* Protect extent trees against block allocations via delalloc */
++ double_down_write_data_sem(orig_inode, donor_inode);
++
+ /* Get the original extent for the block "orig_off" */
+ *err = get_ext_path(orig_inode, orig_off, &orig_path);
+ if (*err)
+@@ -755,6 +758,11 @@ out:
+ kfree(donor_path);
+ }
+
++ ext4_ext_invalidate_cache(orig_inode);
++ ext4_ext_invalidate_cache(donor_inode);
++
++ double_up_write_data_sem(orig_inode, donor_inode);
++
+ return replaced_count;
+ }
+
+@@ -820,19 +828,9 @@ move_extent_per_page(struct file *o_filp
+ * Just swap data blocks between orig and donor.
+ */
+ if (uninit) {
+- /*
+- * Protect extent trees against block allocations
+- * via delalloc
+- */
+- double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode,
+ donor_inode, orig_blk_offset,
+ block_len_in_page, err);
+-
+- /* Clear the inode cache not to refer to the old data */
+- ext4_ext_invalidate_cache(orig_inode);
+- ext4_ext_invalidate_cache(donor_inode);
+- double_up_write_data_sem(orig_inode, donor_inode);
+ goto out2;
+ }
+
+@@ -880,8 +878,6 @@ move_extent_per_page(struct file *o_filp
+ /* Release old bh and drop refs */
+ try_to_release_page(page, 0);
+
+- /* Protect extent trees against block allocations via delalloc */
+- double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
+ orig_blk_offset, block_len_in_page,
+ &err2);
+@@ -890,18 +886,10 @@ move_extent_per_page(struct file *o_filp
+ block_len_in_page = replaced_count;
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+- } else {
+- double_up_write_data_sem(orig_inode, donor_inode);
++ } else
+ goto out;
+- }
+ }
+
+- /* Clear the inode cache not to refer to the old data */
+- ext4_ext_invalidate_cache(orig_inode);
+- ext4_ext_invalidate_cache(donor_inode);
+-
+- double_up_write_data_sem(orig_inode, donor_inode);
+-
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
+
+
+
+From linux@linux.site Thu Dec 10 20:28:15 2009
+Message-Id: <20091211042815.186798204@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:54 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [76/90] jbd2: Add ENOMEM checking in and for jbd2_journal_write_metadata_buffer()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0076-jbd2-Add-ENOMEM-checking-in-and-for-jbd2_journal_wri.patch
+Content-Length: 1035
+Lines: 38
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit e6ec116b67f46e0e7808276476554727b2e6240b)
+
+OOM happens.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/jbd2/commit.c | 4 ++++
+ fs/jbd2/journal.c | 4 ++++
+ 2 files changed, 8 insertions(+)
+
+--- a/fs/jbd2/commit.c
++++ b/fs/jbd2/commit.c
+@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(jou
+ JBUFFER_TRACE(jh, "ph3: write metadata");
+ flags = jbd2_journal_write_metadata_buffer(commit_transaction,
+ jh, &new_jh, blocknr);
++ if (flags < 0) {
++ jbd2_journal_abort(journal, flags);
++ continue;
++ }
+ set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+ wbuf[bufs++] = jh2bh(new_jh);
+
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -361,6 +361,10 @@ repeat:
+
+ jbd_unlock_bh_state(bh_in);
+ tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
++ if (!tmp) {
++ jbd2_journal_put_journal_head(new_jh);
++ return -ENOMEM;
++ }
+ jbd_lock_bh_state(bh_in);
+ if (jh_in->b_frozen_data) {
+ jbd2_free(tmp, bh_in->b_size);
+
+
+From linux@linux.site Thu Dec 10 20:28:16 2009
+Message-Id: <20091211042815.716499145@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:55 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Roel Kluin <roel.kluin@gmail.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [77/90] ext4: Return the PTR_ERR of the correct pointer in setup_new_group_blocks()
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0077-ext4-Return-the-PTR_ERR-of-the-correct-pointer-in-se.patch
+Content-Length: 595
+Lines: 21
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit c09eef305dd43846360944ad072f051f964fa383)
+
+Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/resize.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct
+ goto exit_bh;
+
+ if (IS_ERR(gdb = bclean(handle, sb, block))) {
+- err = PTR_ERR(bh);
++ err = PTR_ERR(gdb);
+ goto exit_bh;
+ }
+ ext4_handle_dirty_metadata(handle, NULL, gdb);
+
+
+From linux@linux.site Thu Dec 10 20:28:16 2009
+Message-Id: <20091211042816.324947251@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:56 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Jan Kara <jack@suse.cz>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [78/90] ext4: Avoid data / filesystem corruption when write fails to copy data
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0078-ext4-Avoid-data-filesystem-corruption-when-write-fai.patch
+Content-Length: 2923
+Lines: 84
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit b9a4207d5e911b938f73079a83cc2ae10524ec7f)
+
+When ext4_write_begin fails after allocating some blocks or
+generic_perform_write fails to copy data to write, we truncate blocks
+already instantiated beyond i_size. Although these blocks were never
+inside i_size, we have to truncate the pagecache of these blocks so
+that corresponding buffers get unmapped. Otherwise subsequent
+__block_prepare_write (called because we are retrying the write) will
+find the buffers mapped, not call ->get_block, and thus the page will
+be backed by already freed blocks leading to filesystem and data
+corruption.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 20 +++++++++++++++-----
+ 1 file changed, 15 insertions(+), 5 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1535,6 +1535,16 @@ static int do_journal_get_write_access(h
+ return ext4_journal_get_write_access(handle, bh);
+ }
+
++/*
++ * Truncate blocks that were not used by write. We have to truncate the
++ * pagecache as well so that corresponding buffers get properly unmapped.
++ */
++static void ext4_truncate_failed_write(struct inode *inode)
++{
++ truncate_inode_pages(inode->i_mapping, inode->i_size);
++ ext4_truncate(inode);
++}
++
+ static int ext4_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+@@ -1600,7 +1610,7 @@ retry:
+
+ ext4_journal_stop(handle);
+ if (pos + len > inode->i_size) {
+- ext4_truncate(inode);
++ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+@@ -1710,7 +1720,7 @@ static int ext4_ordered_write_end(struct
+ ret = ret2;
+
+ if (pos + len > inode->i_size) {
+- ext4_truncate(inode);
++ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+@@ -1752,7 +1762,7 @@ static int ext4_writeback_write_end(stru
+ ret = ret2;
+
+ if (pos + len > inode->i_size) {
+- ext4_truncate(inode);
++ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+@@ -1815,7 +1825,7 @@ static int ext4_journalled_write_end(str
+ if (!ret)
+ ret = ret2;
+ if (pos + len > inode->i_size) {
+- ext4_truncate(inode);
++ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might still be
+ * on the orphan list; we need to make sure the inode
+@@ -3087,7 +3097,7 @@ retry:
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+- ext4_truncate(inode);
++ ext4_truncate_failed_write(inode);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+
+
+From linux@linux.site Thu Dec 10 20:28:17 2009
+Message-Id: <20091211042816.881920653@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:57 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Josef Bacik <josef@redhat.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [79/90] ext4: wait for log to commit when umounting
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0079-ext4-wait-for-log-to-commit-when-umounting.patch
+Content-Length: 1540
+Lines: 46
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit d4edac314e9ad0b21ba20ba8bc61b61f186f79e1)
+
+There is a potential race when a transaction is committing right when
+the file system is being unmounted. This could result in a race
+because EXT4_SB(sb)->s_group_info could be freed in ext4_put_super
+before the commit code calls a callback so the mballoc code can
+release freed blocks in the transaction, resulting in a panic trying
+to access the freed s_group_info.
+
+The fix is to wait for the transaction to finish committing before we
+shutdown the multiblock allocator.
+
+Signed-off-by: Josef Bacik <josef@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -610,10 +610,6 @@ static void ext4_put_super(struct super_
+ if (sb->s_dirt)
+ ext4_commit_super(sb, 1);
+
+- ext4_release_system_zone(sb);
+- ext4_mb_release(sb);
+- ext4_ext_release(sb);
+- ext4_xattr_put_super(sb);
+ if (sbi->s_journal) {
+ err = jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+@@ -621,6 +617,12 @@ static void ext4_put_super(struct super_
+ ext4_abort(sb, __func__,
+ "Couldn't clean up the journal");
+ }
++
++ ext4_release_system_zone(sb);
++ ext4_mb_release(sb);
++ ext4_ext_release(sb);
++ ext4_xattr_put_super(sb);
++
+ if (!(sb->s_flags & MS_RDONLY)) {
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+
+
+From linux@linux.site Thu Dec 10 20:28:17 2009
+Message-Id: <20091211042817.411285726@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:58 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Curt Wohlgemuth <curtw@google.com>,
+ "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [80/90] ext4: remove blocks from inode prealloc list on failure
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0080-ext4-remove-blocks-from-inode-prealloc-list-on-failu.patch
+Content-Length: 1476
+Lines: 49
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit b844167edc7fcafda9623955c05e4c1b3c32ebc7)
+
+This fixes a leak of blocks in an inode prealloc list if device failures
+cause ext4_mb_mark_diskspace_used() to fail.
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Acked-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3258,6 +3258,24 @@ static void ext4_mb_collect_stats(struct
+ }
+
+ /*
++ * Called on failure; free up any blocks from the inode PA for this
++ * context. We don't need this for MB_GROUP_PA because we only change
++ * pa_free in ext4_mb_release_context(), but on failure, we've already
++ * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
++ */
++static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
++{
++ struct ext4_prealloc_space *pa = ac->ac_pa;
++ int len;
++
++ if (pa && pa->pa_type == MB_INODE_PA) {
++ len = ac->ac_b_ex.fe_len;
++ pa->pa_free += len;
++ }
++
++}
++
++/*
+ * use blocks preallocated to inode
+ */
+ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
+@@ -4546,6 +4564,7 @@ repeat:
+ ac->ac_status = AC_STATUS_CONTINUE;
+ goto repeat;
+ } else if (*errp) {
++ ext4_discard_allocated_blocks(ac);
+ ac->ac_b_ex.fe_len = 0;
+ ar->len = 0;
+ ext4_mb_show_ac(ac);
+
+
+From linux@linux.site Thu Dec 10 20:28:18 2009
+Message-Id: <20091211042818.062493875@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:25:59 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Dmitry Monakhov <dmonakhov@openvz.org>,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [81/90] ext4: ext4_get_reserved_space() must return bytes instead of blocks
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0081-ext4-ext4_get_reserved_space-must-return-bytes-inste.patch
+Content-Length: 718
+Lines: 23
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 8aa6790f876e81f5a2211fe1711a5fe3fe2d7b20)
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+Acked-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1053,7 +1053,7 @@ qsize_t ext4_get_reserved_space(struct i
+ EXT4_I(inode)->i_reserved_meta_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+- return total;
++ return (total << inode->i_blkbits);
+ }
+ /*
+ * Calculate the number of metadata blocks need to reserve
+
+
+From linux@linux.site Thu Dec 10 20:28:19 2009
+Message-Id: <20091211042818.571106799@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:00 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Dmitry Monakhov <dmonakhov@openvz.org>,
+ Mingming Cao <cmm@us.ibm.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [82/90] ext4: quota macros cleanup
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0082-ext4-quota-macros-cleanup.patch
+Content-Length: 5167
+Lines: 138
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 5aca07eb7d8f14d90c740834d15ca15277f4820c)
+
+Currently all quota block reservation macros contain a hard-coded "2",
+i.e. the MAXQUOTAS value. This is no good because in some places it is
+not obvious what this digit represents. Let's introduce new macros
+with self-descriptive names.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Acked-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4_jbd2.h | 8 ++++++--
+ fs/ext4/extents.c | 2 +-
+ fs/ext4/inode.c | 2 +-
+ fs/ext4/migrate.c | 4 ++--
+ fs/ext4/namei.c | 8 ++++----
+ 5 files changed, 14 insertions(+), 10 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.h
++++ b/fs/ext4/ext4_jbd2.h
+@@ -49,7 +49,7 @@
+
+ #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
+ EXT4_XATTR_TRANS_BLOCKS - 2 + \
+- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
++ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+ /*
+ * Define the number of metadata blocks we need to account to modify data.
+@@ -57,7 +57,7 @@
+ * This include super block, inode block, quota blocks and xattr blocks
+ */
+ #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
+- 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
++ EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
+
+ /* Delete operations potentially hit one directory's namespace plus an
+ * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
+@@ -92,6 +92,7 @@
+ * but inode, sb and group updates are done only once */
+ #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
++
+ #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+ (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
+ #else
+@@ -99,6 +100,9 @@
+ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
+ #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
+ #endif
++#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
++#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
++#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
+
+ int
+ ext4_mark_iloc_dirty(handle_t *handle,
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2147,7 +2147,7 @@ ext4_ext_rm_leaf(handle_t *handle, struc
+ correct_index = 1;
+ credits += (ext_depth(inode)) + 1;
+ }
+- credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
++ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ if (err)
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5221,7 +5221,7 @@ int ext4_setattr(struct dentry *dentry,
+
+ /* (user+group)*(old+new) structure, inode write (sb,
+ * inode block, ? - but truncate inode update has it) */
+- handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+
++ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+ EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(hand
+ * So allocate a credit of 3. We may update
+ * quota (user and group).
+ */
+- needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
++ needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
+
+ if (ext4_journal_extend(handle, needed) != 0)
+ retval = ext4_journal_restart(handle, needed);
+@@ -477,7 +477,7 @@ int ext4_ext_migrate(struct inode *inode
+ handle = ext4_journal_start(inode,
+ EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+- 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)
++ EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
+ + 1);
+ if (IS_ERR(handle)) {
+ retval = PTR_ERR(handle);
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1769,7 +1769,7 @@ static int ext4_create(struct inode *dir
+ retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+@@ -1803,7 +1803,7 @@ static int ext4_mknod(struct inode *dir,
+ retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+@@ -1840,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir,
+ retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
+- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+@@ -2253,7 +2253,7 @@ static int ext4_symlink(struct inode *di
+ retry:
+ handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
+- 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb));
++ EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+
+
+From linux@linux.site Thu Dec 10 20:28:19 2009
+Message-Id: <20091211042819.212643394@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:01 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Dmitry Monakhov <dmonakhov@openvz.org>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [83/90] ext4: fix incorrect block reservation on quota transfer.
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0083-ext4-fix-incorrect-block-reservation-on-quota-transf.patch
+Content-Length: 1036
+Lines: 27
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 194074acacebc169ded90a4657193f5180015051)
+
+Inside a ->setattr() call both ATTR_UID and ATTR_GID may be valid.
+This means that we may end up transferring all quotas, and so
+we have to reserve QUOTA_DEL_BLOCKS for all quotas, as we do in
+the case of QUOTA_INIT_BLOCKS.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Reviewed-by: Mingming Cao <cmm@us.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5222,7 +5222,7 @@ int ext4_setattr(struct dentry *dentry,
+ /* (user+group)*(old+new) structure, inode write (sb,
+ * inode block, ? - but truncate inode update has it) */
+ handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
+- EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
++ EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto err_out;
+
+
+From linux@linux.site Thu Dec 10 20:28:20 2009
+Message-Id: <20091211042819.790485160@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:02 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Jan Kara <jack@suse.cz>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [84/90] ext4: Wait for proper transaction commit on fsync
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0084-ext4-Wait-for-proper-transaction-commit-on-fsync.patch
+Content-Length: 7849
+Lines: 252
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645)
+
+We cannot rely on buffer dirty bits during fsync because pdflush can come
+before fsync is called and clear dirty bits without forcing a transaction
+commit. Instead, we track which transaction last changed the inode and
+which transaction last changed its block allocation, and on fsync (or
+fdatasync) we force the corresponding transaction to disk.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 7 +++++++
+ fs/ext4/ext4_jbd2.h | 13 +++++++++++++
+ fs/ext4/extents.c | 14 ++++++++++++--
+ fs/ext4/fsync.c | 46 +++++++++++++++++-----------------------------
+ fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++
+ fs/ext4/super.c | 2 ++
+ fs/jbd2/journal.c | 1 +
+ 7 files changed, 81 insertions(+), 31 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -700,6 +700,13 @@ struct ext4_inode_info {
+ struct list_head i_aio_dio_complete_list;
+ /* current io_end structure for async DIO write*/
+ ext4_io_end_t *cur_aio_dio;
++
++ /*
++ * Transactions that contain inode's metadata needed to complete
++ * fsync and fdatasync, respectively.
++ */
++ tid_t i_sync_tid;
++ tid_t i_datasync_tid;
+ };
+
+ /*
+--- a/fs/ext4/ext4_jbd2.h
++++ b/fs/ext4/ext4_jbd2.h
+@@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(h
+ return 0;
+ }
+
++static inline void ext4_update_inode_fsync_trans(handle_t *handle,
++ struct inode *inode,
++ int datasync)
++{
++ struct ext4_inode_info *ei = EXT4_I(inode);
++
++ if (ext4_handle_valid(handle)) {
++ ei->i_sync_tid = handle->h_transaction->t_tid;
++ if (datasync)
++ ei->i_datasync_tid = handle->h_transaction->t_tid;
++ }
++}
++
+ /* super.c */
+ int ext4_force_commit(struct super_block *sb);
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3041,6 +3041,8 @@ ext4_ext_handle_uninitialized_extents(ha
+ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
+ ret = ext4_convert_unwritten_extents_dio(handle, inode,
+ path);
++ if (ret >= 0)
++ ext4_update_inode_fsync_trans(handle, inode, 1);
+ goto out2;
+ }
+ /* buffered IO case */
+@@ -3068,6 +3070,8 @@ ext4_ext_handle_uninitialized_extents(ha
+ ret = ext4_ext_convert_to_initialized(handle, inode,
+ path, iblock,
+ max_blocks);
++ if (ret >= 0)
++ ext4_update_inode_fsync_trans(handle, inode, 1);
+ out:
+ if (ret <= 0) {
+ err = ret;
+@@ -3306,10 +3310,16 @@ int ext4_ext_get_blocks(handle_t *handle
+ allocated = ext4_ext_get_actual_len(&newex);
+ set_buffer_new(bh_result);
+
+- /* Cache only when it is _not_ an uninitialized extent */
+- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
++ /*
++ * Cache the extent and update transaction to commit on fdatasync only
++ * when it is _not_ an uninitialized extent.
++ */
++ if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
+ ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+ EXT4_EXT_CACHE_EXTENT);
++ ext4_update_inode_fsync_trans(handle, inode, 1);
++ } else
++ ext4_update_inode_fsync_trans(handle, inode, 0);
+ out:
+ if (allocated > max_blocks)
+ allocated = max_blocks;
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -51,25 +51,30 @@
+ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
+ {
+ struct inode *inode = dentry->d_inode;
++ struct ext4_inode_info *ei = EXT4_I(inode);
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+- int err, ret = 0;
++ int ret;
++ tid_t commit_tid;
+
+ J_ASSERT(ext4_journal_current_handle() == NULL);
+
+ trace_ext4_sync_file(file, dentry, datasync);
+
++ if (inode->i_sb->s_flags & MS_RDONLY)
++ return 0;
++
+ ret = flush_aio_dio_completed_IO(inode);
+ if (ret < 0)
+ return ret;
++
++ if (!journal)
++ return simple_fsync(file, dentry, datasync);
++
+ /*
+- * data=writeback:
++ * data=writeback,ordered:
+ * The caller's filemap_fdatawrite()/wait will sync the data.
+- * sync_inode() will sync the metadata
+- *
+- * data=ordered:
+- * The caller's filemap_fdatawrite() will write the data and
+- * sync_inode() will write the inode if it is dirty. Then the caller's
+- * filemap_fdatawait() will wait on the pages.
++ * Metadata is in the journal, we wait for proper transaction to
++ * commit here.
+ *
+ * data=journal:
+ * filemap_fdatawrite won't do anything (the buffers are clean).
+@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, st
+ if (ext4_should_journal_data(inode))
+ return ext4_force_commit(inode->i_sb);
+
+- if (!journal)
+- ret = sync_mapping_buffers(inode->i_mapping);
+-
+- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+- goto out;
+-
+- /*
+- * The VFS has written the file data. If the inode is unaltered
+- * then we need not start a commit.
+- */
+- if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+- struct writeback_control wbc = {
+- .sync_mode = WB_SYNC_ALL,
+- .nr_to_write = 0, /* sys_fsync did this */
+- };
+- err = sync_inode(inode, &wbc);
+- if (ret == 0)
+- ret = err;
+- }
+-out:
+- if (journal && (journal->j_flags & JBD2_BARRIER))
++ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
++ if (jbd2_log_start_commit(journal, commit_tid))
++ jbd2_log_wait_commit(journal, commit_tid);
++ else if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ return ret;
+ }
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1026,6 +1026,8 @@ static int ext4_ind_get_blocks(handle_t
+ goto cleanup;
+
+ set_buffer_new(bh_result);
++
++ ext4_update_inode_fsync_trans(handle, inode, 1);
+ got_it:
+ map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+ if (count > blocks_to_boundary)
+@@ -4784,6 +4786,7 @@ struct inode *ext4_iget(struct super_blo
+ struct ext4_inode *raw_inode;
+ struct ext4_inode_info *ei;
+ struct inode *inode;
++ journal_t *journal = EXT4_SB(sb)->s_journal;
+ long ret;
+ int block;
+
+@@ -4848,6 +4851,31 @@ struct inode *ext4_iget(struct super_blo
+ ei->i_data[block] = raw_inode->i_block[block];
+ INIT_LIST_HEAD(&ei->i_orphan);
+
++ /*
++ * Set transaction id's of transactions that have to be committed
++ * to finish f[data]sync. We set them to currently running transaction
++ * as we cannot be sure that the inode or some of its metadata isn't
++ * part of the transaction - the inode could have been reclaimed and
++ * now it is reread from disk.
++ */
++ if (journal) {
++ transaction_t *transaction;
++ tid_t tid;
++
++ spin_lock(&journal->j_state_lock);
++ if (journal->j_running_transaction)
++ transaction = journal->j_running_transaction;
++ else
++ transaction = journal->j_committing_transaction;
++ if (transaction)
++ tid = transaction->t_tid;
++ else
++ tid = journal->j_commit_sequence;
++ spin_unlock(&journal->j_state_lock);
++ ei->i_sync_tid = tid;
++ ei->i_datasync_tid = tid;
++ }
++
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+ if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+@@ -5102,6 +5130,7 @@ static int ext4_do_update_inode(handle_t
+ err = rc;
+ ei->i_state &= ~EXT4_STATE_NEW;
+
++ ext4_update_inode_fsync_trans(handle, inode, 0);
+ out_brelse:
+ brelse(bh);
+ ext4_std_error(inode->i_sb, err);
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -713,6 +713,8 @@ static struct inode *ext4_alloc_inode(st
+ spin_lock_init(&(ei->i_block_reservation_lock));
+ INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
+ ei->cur_aio_dio = NULL;
++ ei->i_sync_tid = 0;
++ ei->i_datasync_tid = 0;
+
+ return &ei->vfs_inode;
+ }
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
+ EXPORT_SYMBOL(jbd2_journal_ack_err);
+ EXPORT_SYMBOL(jbd2_journal_clear_err);
+ EXPORT_SYMBOL(jbd2_log_wait_commit);
++EXPORT_SYMBOL(jbd2_log_start_commit);
+ EXPORT_SYMBOL(jbd2_journal_start_commit);
+ EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
+ EXPORT_SYMBOL(jbd2_journal_wipe);
+
+
+From linux@linux.site Thu Dec 10 20:28:20 2009
+Message-Id: <20091211042820.350577113@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:03 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Akira Fujita <a-fujita@rs.jp.nec.com>,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [85/90] ext4: Fix insufficient checks in EXT4_IOC_MOVE_EXT
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=0085-ext4-Fix-insufficient-checks-in-EXT4_IOC_MOVE_EXT.patch
+Content-Length: 2732
+Lines: 94
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit 4a58579b9e4e2a35d57e6c9c8483e52f6f1b7fd6)
+
+This patch fixes three problems in the handling of the
+EXT4_IOC_MOVE_EXT ioctl:
+
+1. The current EXT4_IOC_MOVE_EXT only checks read access for the
+original and donor files, which allows an illegal write to the donor
+file, since the donor file is overwritten by the original file's data.
+To fix this problem, change the access mode checks for the original
+(r->r/w) and donor (r->w) files.
+
+2. Disallow the use of donor files that have the setuid or setgid bit set.
+
+3. Call mnt_want_write() and mnt_drop_write() before and after
+calling ext4_move_extents() to get write access to the mount.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c | 30 ++++++++++++++++++------------
+ fs/ext4/move_extent.c | 7 +++++++
+ 2 files changed, 25 insertions(+), 12 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -221,32 +221,38 @@ setversion_out:
+ struct file *donor_filp;
+ int err;
+
++ if (!(filp->f_mode & FMODE_READ) ||
++ !(filp->f_mode & FMODE_WRITE))
++ return -EBADF;
++
+ if (copy_from_user(&me,
+ (struct move_extent __user *)arg, sizeof(me)))
+ return -EFAULT;
++ me.moved_len = 0;
+
+ donor_filp = fget(me.donor_fd);
+ if (!donor_filp)
+ return -EBADF;
+
+- if (!capable(CAP_DAC_OVERRIDE)) {
+- if ((current->real_cred->fsuid != inode->i_uid) ||
+- !(inode->i_mode & S_IRUSR) ||
+- !(donor_filp->f_dentry->d_inode->i_mode &
+- S_IRUSR)) {
+- fput(donor_filp);
+- return -EACCES;
+- }
++ if (!(donor_filp->f_mode & FMODE_WRITE)) {
++ err = -EBADF;
++ goto mext_out;
+ }
+
+- me.moved_len = 0;
++ err = mnt_want_write(filp->f_path.mnt);
++ if (err)
++ goto mext_out;
++
+ err = ext4_move_extents(filp, donor_filp, me.orig_start,
+ me.donor_start, me.len, &me.moved_len);
+- fput(donor_filp);
++ mnt_drop_write(filp->f_path.mnt);
++ if (me.moved_len > 0)
++ file_remove_suid(donor_filp);
+
+ if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
+- return -EFAULT;
+-
++ err = -EFAULT;
++mext_out:
++ fput(donor_filp);
+ return err;
+ }
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -957,6 +957,13 @@ mext_check_arguments(struct inode *orig_
+ return -EINVAL;
+ }
+
++ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
++ ext4_debug("ext4 move extent: suid or sgid is set"
++ " to donor file [ino:orig %lu, donor %lu]\n",
++ orig_inode->i_ino, donor_inode->i_ino);
++ return -EINVAL;
++ }
++
+ /* Ext4 move extent does not support swapfile */
+ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
+ ext4_debug("ext4 move extent: The argument files should "
+
+
+From linux@linux.site Thu Dec 10 20:28:21 2009
+Message-Id: <20091211042820.904178854@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:04 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ James Bottomley <James.Bottomley@suse.de>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [86/90] SCSI: megaraid_sas: fix 64 bit sense pointer truncation
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=scsi-megaraid_sas-fix-64-bit-sense-pointer-truncation.patch
+Content-Length: 1456
+Lines: 47
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+From: Yang, Bo <Bo.Yang@lsi.com>
+
+commit 7b2519afa1abd1b9f63aa1e90879307842422dae upstream.
+
+The current sense pointer is cast to a u32 pointer, which can truncate
+on 64 bits. Fix by using unsigned long instead.
+
+Signed-off-by Bo Yang<bo.yang@lsi.com>
+Signed-off-by: James Bottomley <James.Bottomley@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/scsi/megaraid/megaraid_sas.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/scsi/megaraid/megaraid_sas.c
++++ b/drivers/scsi/megaraid/megaraid_sas.c
+@@ -3032,7 +3032,7 @@ megasas_mgmt_fw_ioctl(struct megasas_ins
+ int error = 0, i;
+ void *sense = NULL;
+ dma_addr_t sense_handle;
+- u32 *sense_ptr;
++ unsigned long *sense_ptr;
+
+ memset(kbuff_arr, 0, sizeof(kbuff_arr));
+
+@@ -3109,7 +3109,7 @@ megasas_mgmt_fw_ioctl(struct megasas_ins
+ }
+
+ sense_ptr =
+- (u32 *) ((unsigned long)cmd->frame + ioc->sense_off);
++ (unsigned long *) ((unsigned long)cmd->frame + ioc->sense_off);
+ *sense_ptr = sense_handle;
+ }
+
+@@ -3140,8 +3140,8 @@ megasas_mgmt_fw_ioctl(struct megasas_ins
+ * sense_ptr points to the location that has the user
+ * sense buffer address
+ */
+- sense_ptr = (u32 *) ((unsigned long)ioc->frame.raw +
+- ioc->sense_off);
++ sense_ptr = (unsigned long *) ((unsigned long)ioc->frame.raw +
++ ioc->sense_off);
+
+ if (copy_to_user((void __user *)((unsigned long)(*sense_ptr)),
+ sense, ioc->sense_len)) {
+
+
+From linux@linux.site Thu Dec 10 20:28:22 2009
+Message-Id: <20091211042821.450487768@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:05 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Martin Michlmayr <tbm@cyrius.com>,
+ Boaz Harrosh <bharrosh@panasas.com>,
+ James Bottomley <James.Bottomley@suse.de>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [87/90] SCSI: osd_protocol.h: Add missing #include
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=scsi-osd_protocol.h-add-missing-include.patch
+Content-Length: 783
+Lines: 28
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+From: Martin Michlmayr <tbm@cyrius.com>
+
+commit 0899638688f223fd9e9fee60d662665e11693d12 upstream.
+
+include/scsi/osd_protocol.h uses ALIGN() without an #include
+<linux/kernel.h>, leading to:
+| include/scsi/osd_protocol.h:362: error: implicit declaration of function 'ALIGN'
+
+Signed-off-by: Martin Michlmayr <tbm@cyrius.com>
+Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
+Signed-off-by: James Bottomley <James.Bottomley@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/scsi/osd_protocol.h | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/include/scsi/osd_protocol.h
++++ b/include/scsi/osd_protocol.h
+@@ -17,6 +17,7 @@
+ #define __OSD_PROTOCOL_H__
+
+ #include <linux/types.h>
++#include <linux/kernel.h>
+ #include <asm/unaligned.h>
+ #include <scsi/scsi.h>
+
+
+
+From linux@linux.site Thu Dec 10 20:28:22 2009
+Message-Id: <20091211042822.204320413@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:06 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ James Smart <james.smart@emulex.com>,
+ James Bottomley <James.Bottomley@suse.de>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [88/90] SCSI: scsi_lib_dma: fix bug with dma maps on nested scsi objects
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=scsi-scsi_lib_dma-fix-bug-with-dma-maps-on-nested-scsi-objects.patch
+Content-Length: 5210
+Lines: 149
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+From: James Bottomley <James.Bottomley@suse.de>
+
+commit d139b9bd0e52dda14fd13412e7096e68b56d0076 upstream.
+
+Some of our virtual SCSI hosts don't have a proper bus parent at the
+top, which can be a problem for doing DMA on them.
+
+This patch makes the host device cache a pointer to the physical bus
+device and provides an extra API for setting it (the normal API picks
+it up from the parent). This patch also modifies the qla2xxx and lpfc
+vport logic to use the new DMA host setting API.
+
+Acked-By: James Smart <james.smart@emulex.com>
+Signed-off-by: James Bottomley <James.Bottomley@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ drivers/scsi/hosts.c | 13 ++++++++++---
+ drivers/scsi/lpfc/lpfc_init.c | 2 +-
+ drivers/scsi/qla2xxx/qla_attr.c | 3 ++-
+ drivers/scsi/scsi_lib_dma.c | 4 ++--
+ include/scsi/scsi_host.h | 16 +++++++++++++++-
+ 5 files changed, 30 insertions(+), 8 deletions(-)
+
+--- a/drivers/scsi/hosts.c
++++ b/drivers/scsi/hosts.c
+@@ -180,14 +180,20 @@ void scsi_remove_host(struct Scsi_Host *
+ EXPORT_SYMBOL(scsi_remove_host);
+
+ /**
+- * scsi_add_host - add a scsi host
++ * scsi_add_host_with_dma - add a scsi host with dma device
+ * @shost: scsi host pointer to add
+ * @dev: a struct device of type scsi class
++ * @dma_dev: dma device for the host
++ *
++ * Note: You rarely need to worry about this unless you're in a
++ * virtualised host environments, so use the simpler scsi_add_host()
++ * function instead.
+ *
+ * Return value:
+ * 0 on success / != 0 for error
+ **/
+-int scsi_add_host(struct Scsi_Host *shost, struct device *dev)
++int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
++ struct device *dma_dev)
+ {
+ struct scsi_host_template *sht = shost->hostt;
+ int error = -EINVAL;
+@@ -207,6 +213,7 @@ int scsi_add_host(struct Scsi_Host *shos
+
+ if (!shost->shost_gendev.parent)
+ shost->shost_gendev.parent = dev ? dev : &platform_bus;
++ shost->dma_dev = dma_dev;
+
+ error = device_add(&shost->shost_gendev);
+ if (error)
+@@ -262,7 +269,7 @@ int scsi_add_host(struct Scsi_Host *shos
+ fail:
+ return error;
+ }
+-EXPORT_SYMBOL(scsi_add_host);
++EXPORT_SYMBOL(scsi_add_host_with_dma);
+
+ static void scsi_host_dev_release(struct device *dev)
+ {
+--- a/drivers/scsi/lpfc/lpfc_init.c
++++ b/drivers/scsi/lpfc/lpfc_init.c
+@@ -2384,7 +2384,7 @@ lpfc_create_port(struct lpfc_hba *phba,
+ vport->els_tmofunc.function = lpfc_els_timeout;
+ vport->els_tmofunc.data = (unsigned long)vport;
+
+- error = scsi_add_host(shost, dev);
++ error = scsi_add_host_with_dma(shost, dev, &phba->pcidev->dev);
+ if (error)
+ goto out_put_shost;
+
+--- a/drivers/scsi/qla2xxx/qla_attr.c
++++ b/drivers/scsi/qla2xxx/qla_attr.c
+@@ -1654,7 +1654,8 @@ qla24xx_vport_create(struct fc_vport *fc
+ fc_vport_set_state(fc_vport, FC_VPORT_LINKDOWN);
+ }
+
+- if (scsi_add_host(vha->host, &fc_vport->dev)) {
++ if (scsi_add_host_with_dma(vha->host, &fc_vport->dev,
++ &ha->pdev->dev)) {
+ DEBUG15(printk("scsi(%ld): scsi_add_host failure for VP[%d].\n",
+ vha->host_no, vha->vp_idx));
+ goto vport_create_failed_2;
+--- a/drivers/scsi/scsi_lib_dma.c
++++ b/drivers/scsi/scsi_lib_dma.c
+@@ -23,7 +23,7 @@ int scsi_dma_map(struct scsi_cmnd *cmd)
+ int nseg = 0;
+
+ if (scsi_sg_count(cmd)) {
+- struct device *dev = cmd->device->host->shost_gendev.parent;
++ struct device *dev = cmd->device->host->dma_dev;
+
+ nseg = dma_map_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
+ cmd->sc_data_direction);
+@@ -41,7 +41,7 @@ EXPORT_SYMBOL(scsi_dma_map);
+ void scsi_dma_unmap(struct scsi_cmnd *cmd)
+ {
+ if (scsi_sg_count(cmd)) {
+- struct device *dev = cmd->device->host->shost_gendev.parent;
++ struct device *dev = cmd->device->host->dma_dev;
+
+ dma_unmap_sg(dev, scsi_sglist(cmd), scsi_sg_count(cmd),
+ cmd->sc_data_direction);
+--- a/include/scsi/scsi_host.h
++++ b/include/scsi/scsi_host.h
+@@ -677,6 +677,12 @@ struct Scsi_Host {
+ void *shost_data;
+
+ /*
++ * Points to the physical bus device we'd use to do DMA
++ * Needed just in case we have virtual hosts.
++ */
++ struct device *dma_dev;
++
++ /*
+ * We should ensure that this is aligned, both for better performance
+ * and also because some compilers (m68k) don't automatically force
+ * alignment to a long boundary.
+@@ -720,7 +726,9 @@ extern int scsi_queue_work(struct Scsi_H
+ extern void scsi_flush_work(struct Scsi_Host *);
+
+ extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int);
+-extern int __must_check scsi_add_host(struct Scsi_Host *, struct device *);
++extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *,
++ struct device *,
++ struct device *);
+ extern void scsi_scan_host(struct Scsi_Host *);
+ extern void scsi_rescan_device(struct device *);
+ extern void scsi_remove_host(struct Scsi_Host *);
+@@ -731,6 +739,12 @@ extern const char *scsi_host_state_name(
+
+ extern u64 scsi_calculate_bounce_limit(struct Scsi_Host *);
+
++static inline int __must_check scsi_add_host(struct Scsi_Host *host,
++ struct device *dev)
++{
++ return scsi_add_host_with_dma(host, dev, dev);
++}
++
+ static inline struct device *scsi_get_device(struct Scsi_Host *shost)
+ {
+ return shost->shost_gendev.parent;
+
+
+From linux@linux.site Thu Dec 10 20:28:23 2009
+Message-Id: <20091211042822.857844494@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:07 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ Sebastian Andrzej Siewior <sebastian@breakpoint.cc>,
+ Oleg Nesterov <oleg@redhat.com>,
+ Roland McGrath <roland@redhat.com>,
+ Kyle McMartin <kyle@mcmartin.ca>,
+ Thomas Gleixner <tglx@linutronix.de>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [89/90] signal: Fix alternate signal stack check
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=signal-fix-alternate-signal-stack-check.patch
+Content-Length: 2919
+Lines: 83
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+From: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
+
+commit 2a855dd01bc1539111adb7233f587c5c468732ac upstream.
+
+All architectures in the kernel increment/decrement the stack pointer
+before storing values on the stack.
+
+On architectures which have the stack grow down sas_ss_sp == sp is not
+on the alternate signal stack while sas_ss_sp + sas_ss_size == sp is
+on the alternate signal stack.
+
+On architectures which have the stack grow up sas_ss_sp == sp is on
+the alternate signal stack while sas_ss_sp + sas_ss_size == sp is not
+on the alternate signal stack.
+
+The current implementation fails for architectures which have the
+stack grow down on the corner case where sas_ss_sp == sp. This was
+reported as Debian bug #544905 on AMD64.
+Simplified test case: http://download.breakpoint.cc/tc-sig-stack.c
+
+The test case creates the following stack scenario:
+ 0xn0300 stack top
+ 0xn0200 alt stack pointer top (when switching to alt stack)
+ 0xn01ff alt stack end
+ 0xn0100 alt stack start == stack pointer
+
+If the signal is sent the stack pointer is pointing to the base
+address of the alt stack and the kernel erroneously decides that it
+has already switched to the alternate stack because of the current
+check for "sp - sas_ss_sp < sas_ss_size"
+
+On parisc (stack grows up) the scenario would be:
+ 0xn0200 stack pointer
+ 0xn01ff alt stack end
+ 0xn0100 alt stack start = alt stack pointer base
+ (when switching to alt stack)
+ 0xn0000 stack base
+
+This is handled correctly by the current implementation.
+
+[ tglx: Modified for archs which have the stack grow up (parisc) which
+ would fail with the correct implementation for stack grows
+ down. Added a check for sp >= current->sas_ss_sp which is
+ strictly not necessary but makes the code symmetric for both
+ variants ]
+
+Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Roland McGrath <roland@redhat.com>
+Cc: Kyle McMartin <kyle@mcmartin.ca>
+LKML-Reference: <20091025143758.GA6653@Chamillionaire.breakpoint.cc>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ include/linux/sched.h | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1999,11 +1999,18 @@ static inline int is_si_special(const st
+ return info <= SEND_SIG_FORCED;
+ }
+
+-/* True if we are on the alternate signal stack. */
+-
++/*
++ * True if we are on the alternate signal stack.
++ */
+ static inline int on_sig_stack(unsigned long sp)
+ {
+- return (sp - current->sas_ss_sp < current->sas_ss_size);
++#ifdef CONFIG_STACK_GROWSUP
++ return sp >= current->sas_ss_sp &&
++ sp - current->sas_ss_sp < current->sas_ss_size;
++#else
++ return sp > current->sas_ss_sp &&
++ sp - current->sas_ss_sp <= current->sas_ss_size;
++#endif
+ }
+
+ static inline int sas_ss_flags(unsigned long sp)
+
+
+From linux@linux.site Thu Dec 10 20:28:24 2009
+Message-Id: <20091211042823.450484017@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:26:08 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk,
+ "Theodore Tso" <tytso@mit.edu>,
+ Greg Kroah-Hartman <gregkh@suse.de>
+Subject: [90/90] ext4: Fix potential fiemap deadlock (mmap_sem vs. i_data_sem)
+References: <20091211042438.970725457@linux.site>
+Content-Disposition: inline; filename=ext4-fix-potential-fiemap-deadlock-mmap_sem-vs.-i_data_sem.patch
+Content-Length: 5029
+Lines: 115
+
+2.6.31-stable review patch. If anyone has any objections, please let us know.
+
+------------------
+(cherry picked from commit fab3a549e204172236779f502eccb4f9bf0dc87d)
+
+Fix the following potential circular locking dependency between
+mm->mmap_sem and ei->i_data_sem:
+
+ =======================================================
+ [ INFO: possible circular locking dependency detected ]
+ 2.6.32-04115-gec044c5 #37
+ -------------------------------------------------------
+ ureadahead/1855 is trying to acquire lock:
+ (&mm->mmap_sem){++++++}, at: [<ffffffff81107224>] might_fault+0x5c/0xac
+
+ but task is already holding lock:
+ (&ei->i_data_sem){++++..}, at: [<ffffffff811be1fd>] ext4_fiemap+0x11b/0x159
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #1 (&ei->i_data_sem){++++..}:
+ [<ffffffff81099bfa>] __lock_acquire+0xb67/0xd0f
+ [<ffffffff81099e7e>] lock_acquire+0xdc/0x102
+ [<ffffffff81516633>] down_read+0x51/0x84
+ [<ffffffff811a2414>] ext4_get_blocks+0x50/0x2a5
+ [<ffffffff811a3453>] ext4_get_block+0xab/0xef
+ [<ffffffff81154f39>] do_mpage_readpage+0x198/0x48d
+ [<ffffffff81155360>] mpage_readpages+0xd0/0x114
+ [<ffffffff811a104b>] ext4_readpages+0x1d/0x1f
+ [<ffffffff810f8644>] __do_page_cache_readahead+0x12f/0x1bc
+ [<ffffffff810f86f2>] ra_submit+0x21/0x25
+ [<ffffffff810f0cfd>] filemap_fault+0x19f/0x32c
+ [<ffffffff81107b97>] __do_fault+0x55/0x3a2
+ [<ffffffff81109db0>] handle_mm_fault+0x327/0x734
+ [<ffffffff8151aaa9>] do_page_fault+0x292/0x2aa
+ [<ffffffff81518205>] page_fault+0x25/0x30
+ [<ffffffff812a34d8>] clear_user+0x38/0x3c
+ [<ffffffff81167e16>] padzero+0x20/0x31
+ [<ffffffff81168b47>] load_elf_binary+0x8bc/0x17ed
+ [<ffffffff81130e95>] search_binary_handler+0xc2/0x259
+ [<ffffffff81166d64>] load_script+0x1b8/0x1cc
+ [<ffffffff81130e95>] search_binary_handler+0xc2/0x259
+ [<ffffffff8113255f>] do_execve+0x1ce/0x2cf
+ [<ffffffff81027494>] sys_execve+0x43/0x5a
+ [<ffffffff8102918a>] stub_execve+0x6a/0xc0
+
+ -> #0 (&mm->mmap_sem){++++++}:
+ [<ffffffff81099aa4>] __lock_acquire+0xa11/0xd0f
+ [<ffffffff81099e7e>] lock_acquire+0xdc/0x102
+ [<ffffffff81107251>] might_fault+0x89/0xac
+ [<ffffffff81139382>] fiemap_fill_next_extent+0x95/0xda
+ [<ffffffff811bcb43>] ext4_ext_fiemap_cb+0x138/0x157
+ [<ffffffff811be069>] ext4_ext_walk_space+0x178/0x1f1
+ [<ffffffff811be21e>] ext4_fiemap+0x13c/0x159
+ [<ffffffff811390e6>] do_vfs_ioctl+0x348/0x4d6
+ [<ffffffff811392ca>] sys_ioctl+0x56/0x79
+ [<ffffffff81028cb2>] system_call_fastpath+0x16/0x1b
+
+ other info that might help us debug this:
+
+ 1 lock held by ureadahead/1855:
+ #0: (&ei->i_data_sem){++++..}, at: [<ffffffff811be1fd>] ext4_fiemap+0x11b/0x159
+
+ stack backtrace:
+ Pid: 1855, comm: ureadahead Not tainted 2.6.32-04115-gec044c5 #37
+ Call Trace:
+ [<ffffffff81098c70>] print_circular_bug+0xa8/0xb7
+ [<ffffffff81099aa4>] __lock_acquire+0xa11/0xd0f
+ [<ffffffff8102f229>] ? sched_clock+0x9/0xd
+ [<ffffffff81099e7e>] lock_acquire+0xdc/0x102
+ [<ffffffff81107224>] ? might_fault+0x5c/0xac
+ [<ffffffff81107251>] might_fault+0x89/0xac
+ [<ffffffff81107224>] ? might_fault+0x5c/0xac
+ [<ffffffff81124b44>] ? __kmalloc+0x13b/0x18c
+ [<ffffffff81139382>] fiemap_fill_next_extent+0x95/0xda
+ [<ffffffff811bcb43>] ext4_ext_fiemap_cb+0x138/0x157
+ [<ffffffff811bca0b>] ? ext4_ext_fiemap_cb+0x0/0x157
+ [<ffffffff811be069>] ext4_ext_walk_space+0x178/0x1f1
+ [<ffffffff811be21e>] ext4_fiemap+0x13c/0x159
+ [<ffffffff81107224>] ? might_fault+0x5c/0xac
+ [<ffffffff811390e6>] do_vfs_ioctl+0x348/0x4d6
+ [<ffffffff8129f6d0>] ? __up_read+0x8d/0x95
+ [<ffffffff81517fb5>] ? retint_swapgs+0x13/0x1b
+ [<ffffffff811392ca>] sys_ioctl+0x56/0x79
+ [<ffffffff81028cb2>] system_call_fastpath+0x16/0x1b
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -1742,7 +1742,9 @@ int ext4_ext_walk_space(struct inode *in
+ while (block < last && block != EXT_MAX_BLOCK) {
+ num = last - block;
+ /* find extent for this block */
++ down_read(&EXT4_I(inode)->i_data_sem);
+ path = ext4_ext_find_extent(inode, block, path);
++ up_read(&EXT4_I(inode)->i_data_sem);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+@@ -3707,10 +3709,8 @@ int ext4_fiemap(struct inode *inode, str
+ * Walk the extent tree gathering extent information.
+ * ext4_ext_fiemap_cb will push extents back to user.
+ */
+- down_read(&EXT4_I(inode)->i_data_sem);
+ error = ext4_ext_walk_space(inode, start_blk, len_blks,
+ ext4_ext_fiemap_cb, fieinfo);
+- up_read(&EXT4_I(inode)->i_data_sem);
+ }
+
+ return error;
+
+
+From linux@linux.site Thu Dec 10 20:27:24 2009
+Message-Id: <20091211042438.970725457@linux.site>
+User-Agent: quilt/0.47-14.9
+Date: Thu, 10 Dec 2009 20:24:38 -0800
+From: Greg KH <gregkh@suse.de>
+To: linux-kernel@vger.kernel.org,
+ stable@kernel.org
+Cc: stable-review@kernel.org,
+ torvalds@linux-foundation.org,
+ akpm@linux-foundation.org,
+ alan@lxorguk.ukuu.org.uk
+Subject: [00/90] 2.6.31.8-stable review
+Content-Length: 2517
+Lines: 53
+
+This is the start of the stable review cycle for the 2.6.31.8 release.
+There are 90 patches in this series, all will be posted as a response
+to this one. If anyone has any issues with these being applied, please
+let us know. If anyone is a maintainer of the proper subsystem, and
+wants to add a Signed-off-by: line to the patch, please respond with it.
+
+Yes, there are still more patches to be queued up for the .31-stable
+tree, but as I just queued up 86 ext4 patches, I figured I would add 4
+more to make it a nice even 90 and push it out for everyone to enjoy
+while I work on getting the rest out after this.
+
+Responses should be made by Sunday, Dec 13 04:00:00 UTC 2009
+Anything received after that time might be too late.
+
+The whole patch series can be found in one patch at:
+ kernel.org/pub/linux/kernel/v2.6/stable-review/patch-2.6.31.8-rc1.gz
+and the diffstat can be found below.
+
+thanks,
+
+greg k-h
+
+ Documentation/filesystems/ext4.txt | 10 +-
+ drivers/scsi/hosts.c | 13 +-
+ drivers/scsi/lpfc/lpfc_init.c | 2 +-
+ drivers/scsi/megaraid/megaraid_sas.c | 8 +-
+ drivers/scsi/qla2xxx/qla_attr.c | 3 +-
+ drivers/scsi/scsi_lib_dma.c | 4 +-
+ fs/ext4/balloc.c | 8 +-
+ fs/ext4/block_validity.c | 2 +-
+ fs/ext4/ext4.h | 105 +++++-
+ fs/ext4/ext4_extents.h | 7 +-
+ fs/ext4/ext4_jbd2.c | 9 +-
+ fs/ext4/ext4_jbd2.h | 27 ++-
+ fs/ext4/extents.c | 493 +++++++++++++++++++++---
+ fs/ext4/fsync.c | 54 ++--
+ fs/ext4/inode.c | 705 +++++++++++++++++++++++++++++-----
+ fs/ext4/ioctl.c | 32 +-
+ fs/ext4/mballoc.c | 322 ++++++++--------
+ fs/ext4/migrate.c | 28 +-
+ fs/ext4/move_extent.c | 572 ++++++++++++++++------------
+ fs/ext4/namei.c | 47 +--
+ fs/ext4/resize.c | 2 +-
+ fs/ext4/super.c | 239 ++++++++----
+ fs/ext4/xattr.c | 22 +-
+ fs/jbd2/commit.c | 4 +
+ fs/jbd2/journal.c | 11 +
+ fs/jbd2/transaction.c | 7 +-
+ include/linux/sched.h | 13 +-
+ include/scsi/osd_protocol.h | 1 +
+ include/scsi/scsi_host.h | 16 +-
+ include/trace/events/ext4.h | 60 +++-
+ 30 files changed, 2082 insertions(+), 744 deletions(-)
+