--- /dev/null
+From 268a9f6207f354daedf0f92b0b57986bea37b69c Mon Sep 17 00:00:00 2001
+From: Avi Kivity <avi@redhat.com>
+Date: Thu, 27 May 2010 14:35:58 +0300
+Subject: KVM: MMU: Remove user access when allowing kernel access to gpte.w=0 page
+
+If cr0.wp=0, we have to allow the guest kernel access to a page with pte.w=0.
+We do that by setting spte.w=1, since the host cr0.wp must remain set so the
+host can write-protect pages. Once we allow write access, we must remove
+user access; otherwise we would mistakenly allow the user to write the page.
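+
+To spell out the access combinations being emulated here (editor's note: an
+illustrative summary added for clarity, not text from the patch):
+
+        /* guest cr0.wp=0, gpte.w=0, shadow paging (!tdp_enabled):
+         *
+         *   guest kernel write: must succeed      -> needs spte.w=1
+         *   guest user   write: must still fault  -> needs spte.u=0
+         *
+         * spte.w=1 alone would let guest user mode write the page, since
+         * cr0.wp=0 only relaxes write protection for supervisor accesses.
+         * Clearing PT_USER_MASK makes user accesses fault into KVM, which
+         * can then handle them with the correct permission check.
+         */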
+
+Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit 69325a122580d3a7b26589e8efdd6663001c3297)
+---
+ arch/x86/kvm/mmu.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -1843,6 +1843,9 @@ static int set_spte(struct kvm_vcpu *vcp
+
+ spte |= PT_WRITABLE_MASK;
+
++ if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
++ spte &= ~PT_USER_MASK;
++
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
--- /dev/null
+From fea2aabf4ac586092b1a3acb4adb234bb4bf6266 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:14 -0400
+Subject: ext4: Fix potential quota deadlock
+
+commit d21cd8f163ac44b15c465aab7306db931c606908 upstream (as of v2.6.33-rc2)
+
+We have to delay vfs_dq_claim_space() until allocation context destruction.
+Currently we have the following call trace:
+ext4_mb_new_blocks()
+ /* task is already holding ac->alloc_semp */
+ ->ext4_mb_mark_diskspace_used
+ ->vfs_dq_claim_space() /* acquire dqptr_sem here. Possible deadlock */
+ ->ext4_mb_release_context() /* drop ac->alloc_semp here */
+
+Let's move quota claiming to ext4_da_update_reserve_space()
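+
+For comparison, the ordering after the fix looks roughly like this (editor's
+note: hand-drawn from the description above, not quoted from the code):
+
+ext4_mb_new_blocks()
+  /* ac->alloc_semp held */
+  ->ext4_mb_mark_diskspace_used   /* no dqptr_sem taken here any more */
+  ->ext4_mb_release_context()     /* drop ac->alloc_semp */
+...
+ext4_da_update_reserve_space()
+  ->vfs_dq_claim_block()          /* dqptr_sem acquired without alloc_semp */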
+
+ =======================================================
+ [ INFO: possible circular locking dependency detected ]
+ 2.6.32-rc7 #18
+ -------------------------------------------------------
+ write-truncate-/3465 is trying to acquire lock:
+ (&s->s_dquot.dqptr_sem){++++..}, at: [<c025e73b>] dquot_claim_space+0x3b/0x1b0
+
+ but task is already holding lock:
+ (&meta_group_info[i]->alloc_sem){++++..}, at: [<c02ce962>] ext4_mb_load_buddy+0xb2/0x370
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #3 (&meta_group_info[i]->alloc_sem){++++..}:
+ [<c017d04b>] __lock_acquire+0xd7b/0x1260
+ [<c017d5ea>] lock_acquire+0xba/0xd0
+ [<c0527191>] down_read+0x51/0x90
+ [<c02ce962>] ext4_mb_load_buddy+0xb2/0x370
+ [<c02d0c1c>] ext4_mb_free_blocks+0x46c/0x870
+ [<c029c9d3>] ext4_free_blocks+0x73/0x130
+ [<c02c8cfc>] ext4_ext_truncate+0x76c/0x8d0
+ [<c02a8087>] ext4_truncate+0x187/0x5e0
+ [<c01e0f7b>] vmtruncate+0x6b/0x70
+ [<c022ec02>] inode_setattr+0x62/0x190
+ [<c02a2d7a>] ext4_setattr+0x25a/0x370
+ [<c022ee81>] notify_change+0x151/0x340
+ [<c021349d>] do_truncate+0x6d/0xa0
+ [<c0221034>] may_open+0x1d4/0x200
+ [<c022412b>] do_filp_open+0x1eb/0x910
+ [<c021244d>] do_sys_open+0x6d/0x140
+ [<c021258e>] sys_open+0x2e/0x40
+ [<c0103100>] sysenter_do_call+0x12/0x32
+
+ -> #2 (&ei->i_data_sem){++++..}:
+ [<c017d04b>] __lock_acquire+0xd7b/0x1260
+ [<c017d5ea>] lock_acquire+0xba/0xd0
+ [<c0527191>] down_read+0x51/0x90
+ [<c02a5787>] ext4_get_blocks+0x47/0x450
+ [<c02a74c1>] ext4_getblk+0x61/0x1d0
+ [<c02a7a7f>] ext4_bread+0x1f/0xa0
+ [<c02bcddc>] ext4_quota_write+0x12c/0x310
+ [<c0262d23>] qtree_write_dquot+0x93/0x120
+ [<c0261708>] v2_write_dquot+0x28/0x30
+ [<c025d3fb>] dquot_commit+0xab/0xf0
+ [<c02be977>] ext4_write_dquot+0x77/0x90
+ [<c02be9bf>] ext4_mark_dquot_dirty+0x2f/0x50
+ [<c025e321>] dquot_alloc_inode+0x101/0x180
+ [<c029fec2>] ext4_new_inode+0x602/0xf00
+ [<c02ad789>] ext4_create+0x89/0x150
+ [<c0221ff2>] vfs_create+0xa2/0xc0
+ [<c02246e7>] do_filp_open+0x7a7/0x910
+ [<c021244d>] do_sys_open+0x6d/0x140
+ [<c021258e>] sys_open+0x2e/0x40
+ [<c0103100>] sysenter_do_call+0x12/0x32
+
+ -> #1 (&sb->s_type->i_mutex_key#7/4){+.+...}:
+ [<c017d04b>] __lock_acquire+0xd7b/0x1260
+ [<c017d5ea>] lock_acquire+0xba/0xd0
+ [<c0526505>] mutex_lock_nested+0x65/0x2d0
+ [<c0260c9d>] vfs_load_quota_inode+0x4bd/0x5a0
+ [<c02610af>] vfs_quota_on_path+0x5f/0x70
+ [<c02bc812>] ext4_quota_on+0x112/0x190
+ [<c026345a>] sys_quotactl+0x44a/0x8a0
+ [<c0103100>] sysenter_do_call+0x12/0x32
+
+ -> #0 (&s->s_dquot.dqptr_sem){++++..}:
+ [<c017d361>] __lock_acquire+0x1091/0x1260
+ [<c017d5ea>] lock_acquire+0xba/0xd0
+ [<c0527191>] down_read+0x51/0x90
+ [<c025e73b>] dquot_claim_space+0x3b/0x1b0
+ [<c02cb95f>] ext4_mb_mark_diskspace_used+0x36f/0x380
+ [<c02d210a>] ext4_mb_new_blocks+0x34a/0x530
+ [<c02c83fb>] ext4_ext_get_blocks+0x122b/0x13c0
+ [<c02a5966>] ext4_get_blocks+0x226/0x450
+ [<c02a5ff3>] mpage_da_map_blocks+0xc3/0xaa0
+ [<c02a6ed6>] ext4_da_writepages+0x506/0x790
+ [<c01de272>] do_writepages+0x22/0x50
+ [<c01d766d>] __filemap_fdatawrite_range+0x6d/0x80
+ [<c01d7b9b>] filemap_flush+0x2b/0x30
+ [<c02a40ac>] ext4_alloc_da_blocks+0x5c/0x60
+ [<c029e595>] ext4_release_file+0x75/0xb0
+ [<c0216b59>] __fput+0xf9/0x210
+ [<c0216c97>] fput+0x27/0x30
+ [<c02122dc>] filp_close+0x4c/0x80
+ [<c014510e>] put_files_struct+0x6e/0xd0
+ [<c01451b7>] exit_files+0x47/0x60
+ [<c0146a24>] do_exit+0x144/0x710
+ [<c0147028>] do_group_exit+0x38/0xa0
+ [<c0159abc>] get_signal_to_deliver+0x2ac/0x410
+ [<c0102849>] do_notify_resume+0xb9/0x890
+ [<c01032d2>] work_notifysig+0x13/0x21
+
+ other info that might help us debug this:
+
+ 3 locks held by write-truncate-/3465:
+ #0: (jbd2_handle){+.+...}, at: [<c02e1f8f>] start_this_handle+0x38f/0x5c0
+ #1: (&ei->i_data_sem){++++..}, at: [<c02a57f6>] ext4_get_blocks+0xb6/0x450
+ #2: (&meta_group_info[i]->alloc_sem){++++..}, at: [<c02ce962>] ext4_mb_load_buddy+0xb2/0x370
+
+ stack backtrace:
+ Pid: 3465, comm: write-truncate- Not tainted 2.6.32-rc7 #18
+ Call Trace:
+ [<c0524cb3>] ? printk+0x1d/0x22
+ [<c017ac9a>] print_circular_bug+0xca/0xd0
+ [<c017d361>] __lock_acquire+0x1091/0x1260
+ [<c016bca2>] ? sched_clock_local+0xd2/0x170
+ [<c0178fd0>] ? trace_hardirqs_off_caller+0x20/0xd0
+ [<c017d5ea>] lock_acquire+0xba/0xd0
+ [<c025e73b>] ? dquot_claim_space+0x3b/0x1b0
+ [<c0527191>] down_read+0x51/0x90
+ [<c025e73b>] ? dquot_claim_space+0x3b/0x1b0
+ [<c025e73b>] dquot_claim_space+0x3b/0x1b0
+ [<c02cb95f>] ext4_mb_mark_diskspace_used+0x36f/0x380
+ [<c02d210a>] ext4_mb_new_blocks+0x34a/0x530
+ [<c02c601d>] ? ext4_ext_find_extent+0x25d/0x280
+ [<c02c83fb>] ext4_ext_get_blocks+0x122b/0x13c0
+ [<c016bca2>] ? sched_clock_local+0xd2/0x170
+ [<c016be60>] ? sched_clock_cpu+0x120/0x160
+ [<c016beef>] ? cpu_clock+0x4f/0x60
+ [<c0178fd0>] ? trace_hardirqs_off_caller+0x20/0xd0
+ [<c052712c>] ? down_write+0x8c/0xa0
+ [<c02a5966>] ext4_get_blocks+0x226/0x450
+ [<c016be60>] ? sched_clock_cpu+0x120/0x160
+ [<c016beef>] ? cpu_clock+0x4f/0x60
+ [<c017908b>] ? trace_hardirqs_off+0xb/0x10
+ [<c02a5ff3>] mpage_da_map_blocks+0xc3/0xaa0
+ [<c01d69cc>] ? find_get_pages_tag+0x16c/0x180
+ [<c01d6860>] ? find_get_pages_tag+0x0/0x180
+ [<c02a73bd>] ? __mpage_da_writepage+0x16d/0x1a0
+ [<c01dfc4e>] ? pagevec_lookup_tag+0x2e/0x40
+ [<c01ddf1b>] ? write_cache_pages+0xdb/0x3d0
+ [<c02a7250>] ? __mpage_da_writepage+0x0/0x1a0
+ [<c02a6ed6>] ext4_da_writepages+0x506/0x790
+ [<c016beef>] ? cpu_clock+0x4f/0x60
+ [<c016bca2>] ? sched_clock_local+0xd2/0x170
+ [<c016be60>] ? sched_clock_cpu+0x120/0x160
+ [<c016be60>] ? sched_clock_cpu+0x120/0x160
+ [<c02a69d0>] ? ext4_da_writepages+0x0/0x790
+ [<c01de272>] do_writepages+0x22/0x50
+ [<c01d766d>] __filemap_fdatawrite_range+0x6d/0x80
+ [<c01d7b9b>] filemap_flush+0x2b/0x30
+ [<c02a40ac>] ext4_alloc_da_blocks+0x5c/0x60
+ [<c029e595>] ext4_release_file+0x75/0xb0
+ [<c0216b59>] __fput+0xf9/0x210
+ [<c0216c97>] fput+0x27/0x30
+ [<c02122dc>] filp_close+0x4c/0x80
+ [<c014510e>] put_files_struct+0x6e/0xd0
+ [<c01451b7>] exit_files+0x47/0x60
+ [<c0146a24>] do_exit+0x144/0x710
+ [<c017b163>] ? lock_release_holdtime+0x33/0x210
+ [<c0528137>] ? _spin_unlock_irq+0x27/0x30
+ [<c0147028>] do_group_exit+0x38/0xa0
+ [<c017babb>] ? trace_hardirqs_on+0xb/0x10
+ [<c0159abc>] get_signal_to_deliver+0x2ac/0x410
+ [<c0102849>] do_notify_resume+0xb9/0x890
+ [<c0178fd0>] ? trace_hardirqs_off_caller+0x20/0xd0
+ [<c017b163>] ? lock_release_holdtime+0x33/0x210
+ [<c0165b50>] ? autoremove_wake_function+0x0/0x50
+ [<c017ba54>] ? trace_hardirqs_on_caller+0x134/0x190
+ [<c017babb>] ? trace_hardirqs_on+0xb/0x10
+ [<c0300ba4>] ? security_file_permission+0x14/0x20
+ [<c0215761>] ? vfs_write+0x131/0x190
+ [<c0214f50>] ? do_sync_write+0x0/0x120
+ [<c0103115>] ? sysenter_do_call+0x27/0x32
+ [<c01032d2>] work_notifysig+0x13/0x21
+
+CC: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 9 +++++++--
+ fs/ext4/mballoc.c | 6 ------
+ 2 files changed, 7 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1088,7 +1088,7 @@ static int ext4_calc_metadata_amount(str
+ static void ext4_da_update_reserve_space(struct inode *inode, int used)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+- int total, mdb, mdb_free;
++ int total, mdb, mdb_free, mdb_claim = 0;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metablocks still need to be reserved */
+@@ -1101,7 +1101,9 @@ static void ext4_da_update_reserve_space
+
+ if (mdb_free) {
+ /* Account for allocated meta_blocks */
+- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
++ mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
++ BUG_ON(mdb_free < mdb_claim);
++ mdb_free -= mdb_claim;
+
+ /* update fs dirty blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+@@ -1112,8 +1114,11 @@ static void ext4_da_update_reserve_space
+ /* update per-inode reservations */
+ BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= used;
++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
++ vfs_dq_claim_block(inode, used + mdb_claim);
++
+ /*
+ * free those over-booking quota for metadata blocks
+ */
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+- else {
+- percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+- ac->ac_b_ex.fe_len);
+- /* convert reserved quota blocks to real quota blocks */
+- vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
+- }
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
--- /dev/null
+From 9ce5c64e94beb615d6581e7b8839bb0173903425 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <joerg.roedel@amd.com>
+Date: Mon, 17 May 2010 14:43:34 +0200
+Subject: KVM: SVM: Handle MCEs early in the vmexit process
+
+This patch moves handling of the MC vmexits to an earlier
+point in the vmexit path. The handle_exit function is too late
+because the vcpu might already have changed its physical
+cpu.
+
+Cc: stable@kernel.org
+Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit fe5913e4e1700cbfc337f4b1da9ddb26f6a55586)
+---
+ arch/x86/kvm/svm.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -1257,7 +1257,7 @@ static int nm_interception(struct vcpu_s
+ return 1;
+ }
+
+-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
++static void svm_handle_mce(struct vcpu_svm *svm)
+ {
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+@@ -1267,6 +1267,11 @@ static int mc_interception(struct vcpu_s
+ "int $0x12\n");
+ /* not sure if we ever come back to this point */
+
++ return;
++}
++
++static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
++{
+ return 1;
+ }
+
+@@ -2717,6 +2722,14 @@ static void svm_vcpu_run(struct kvm_vcpu
+ vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+ vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+ }
++
++ /*
++ * We need to handle MC intercepts here before the vcpu has a chance to
++ * change the physical cpu
++ */
++ if (unlikely(svm->vmcb->control.exit_code ==
++ SVM_EXIT_EXCP_BASE + MC_VECTOR))
++ svm_handle_mce(svm);
+ }
+
+ #undef R
--- /dev/null
+From f57e36578513418a67eef4912c8503a47a4993aa Mon Sep 17 00:00:00 2001
+From: Surbhi Palande <surbhi.palande@canonical.com>
+Date: Sun, 30 May 2010 22:49:16 -0400
+Subject: ext4: replace BUG() with return -EIO in ext4_ext_get_blocks
+
+commit 034fb4c95fc0fed4ec4a50778127b92c6f2aec01 upstream (as of v2.6.33-rc3)
+
+This patch fixes Kernel BZ #14286. When the address of an extent
+corresponding to a valid block is corrupted, a -EIO should be reported
+instead of a BUG(). This situation should not normally occur
+except in the case of a corrupted filesystem. If it does, however,
+the system should not panic outright; instead, the appropriate action
+should be taken based on the mount-time options. If the mount options
+so permit, the I/O should be gracefully aborted by returning -EIO.
+
+http://bugzilla.kernel.org/show_bug.cgi?id=14286
+
+Signed-off-by: Surbhi Palande <surbhi.palande@canonical.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3196,7 +3196,13 @@ int ext4_ext_get_blocks(handle_t *handle
+ * this situation is possible, though, _during_ tree modification;
+ * this is why assert can't be put in ext4_ext_find_extent()
+ */
+- BUG_ON(path[depth].p_ext == NULL && depth != 0);
++ if (path[depth].p_ext == NULL && depth != 0) {
++ ext4_error(inode->i_sb, __func__, "bad extent address "
++ "inode: %lu, iblock: %d, depth: %d",
++ inode->i_ino, iblock, depth);
++ err = -EIO;
++ goto out2;
++ }
+ eh = path[depth].p_hdr;
+
+ ex = path[depth].p_ext;
--- /dev/null
+From a61279422bc32ecbf85e3a6a9349287c7df0b0b1 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <joerg.roedel@amd.com>
+Date: Mon, 17 May 2010 14:43:35 +0200
+Subject: KVM: SVM: Implement workaround for Erratum 383
+
+This patch implements a workaround for AMD erratum 383 in
+KVM. Without this erratum fix it is possible for a guest to
+kill the host machine. The patch implements the suggested
+workaround for hypervisors, which will be published in the
+next revision guide update.
+
+[jan: fix overflow warning on i386]
+[xiao: fix unused variable warning]
+
+Cc: stable@kernel.org
+Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
+Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
+Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit 67ec66077799f2fef84b21a643912b179c422281)
+---
+ arch/x86/include/asm/msr-index.h | 1
+ arch/x86/kvm/svm.c | 84 ++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 84 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -106,6 +106,7 @@
+ #define MSR_AMD64_PATCH_LOADER 0xc0010020
+ #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
+ #define MSR_AMD64_OSVW_STATUS 0xc0010141
++#define MSR_AMD64_DC_CFG 0xc0011022
+ #define MSR_AMD64_IBSFETCHCTL 0xc0011030
+ #define MSR_AMD64_IBSFETCHLINAD 0xc0011031
+ #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -27,6 +27,7 @@
+ #include <linux/sched.h>
+ #include <linux/ftrace_event.h>
+
++#include <asm/tlbflush.h>
+ #include <asm/desc.h>
+
+ #include <asm/virtext.h>
+@@ -62,6 +63,8 @@ MODULE_LICENSE("GPL");
+ #define nsvm_printk(fmt, args...) do {} while(0)
+ #endif
+
++static bool erratum_383_found __read_mostly;
++
+ static const u32 host_save_user_msrs[] = {
+ #ifdef CONFIG_X86_64
+ MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+@@ -299,6 +302,31 @@ static void skip_emulated_instruction(st
+ svm_set_interrupt_shadow(vcpu, 0);
+ }
+
++static void svm_init_erratum_383(void)
++{
++ u32 low, high;
++ int err;
++ u64 val;
++
++ /* Only Fam10h is affected */
++ if (boot_cpu_data.x86 != 0x10)
++ return;
++
++ /* Use _safe variants to not break nested virtualization */
++ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
++ if (err)
++ return;
++
++ val |= (1ULL << 47);
++
++ low = lower_32_bits(val);
++ high = upper_32_bits(val);
++
++ native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
++
++ erratum_383_found = true;
++}
++
+ static int has_svm(void)
+ {
+ const char *msg;
+@@ -318,7 +346,6 @@ static void svm_hardware_disable(void *g
+
+ static void svm_hardware_enable(void *garbage)
+ {
+-
+ struct svm_cpu_data *svm_data;
+ uint64_t efer;
+ struct descriptor_table gdt_descr;
+@@ -350,6 +377,10 @@ static void svm_hardware_enable(void *ga
+
+ wrmsrl(MSR_VM_HSAVE_PA,
+ page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
++
++ svm_init_erratum_383();
++
++ return;
+ }
+
+ static void svm_cpu_uninit(int cpu)
+@@ -1257,8 +1288,59 @@ static int nm_interception(struct vcpu_s
+ return 1;
+ }
+
++static bool is_erratum_383(void)
++{
++ int err, i;
++ u64 value;
++
++ if (!erratum_383_found)
++ return false;
++
++ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
++ if (err)
++ return false;
++
++ /* Bit 62 may or may not be set for this mce */
++ value &= ~(1ULL << 62);
++
++ if (value != 0xb600000000010015ULL)
++ return false;
++
++ /* Clear MCi_STATUS registers */
++ for (i = 0; i < 6; ++i)
++ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
++
++ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
++ if (!err) {
++ u32 low, high;
++
++ value &= ~(1ULL << 2);
++ low = lower_32_bits(value);
++ high = upper_32_bits(value);
++
++ native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
++ }
++
++ /* Flush tlb to evict multi-match entries */
++ __flush_tlb_all();
++
++ return true;
++}
++
+ static void svm_handle_mce(struct vcpu_svm *svm)
+ {
++ if (is_erratum_383()) {
++ /*
++ * Erratum 383 triggered. Guest state is corrupt so kill the
++ * guest.
++ */
++ pr_err("KVM: Guest triggered AMD Erratum 383\n");
++
++ set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
++
++ return;
++ }
++
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
--- /dev/null
+From de6e76774ecec8a14ef63d3ad383479ca98633e6 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:17 -0400
+Subject: ext4, jbd2: Add barriers for file systems with exernal journals
+
+commit cc3e1bea5d87635c519da657303690f5538bb4eb upstream (as of v2.6.33-rc3)
+
+This is a bit complicated because we are trying to optimize when we
+send barriers to the fs data disk. We could just throw in an extra
+barrier to the data disk whenever we send a barrier to the journal
+disk, but that's not always strictly necessary.
+
+We only need to send a barrier during a commit when there are data
+blocks which must be written out due to an inode written in
+ordered mode, or if fsync() depends on the commit to force data blocks
+to disk. Finally, before we drop transactions from the beginning of
+the journal during a checkpoint operation, we need to guarantee that
+any blocks that were flushed out to the data disk are firmly on the
+rust platter before we drop the transaction from the journal.
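+
+A condensed sketch of the rule the hunks below implement (editor's note:
+paraphrased summary, not the literal diff):
+
+        /* Flush the *data* device only when the journal is external and
+         * barriers are enabled:
+         */
+        if (journal->j_fs_dev != journal->j_dev &&
+            (journal->j_flags & JBD2_BARRIER))
+                blkdev_issue_flush(journal->j_fs_dev, NULL);
+
+        /* ...issued (a) at commit time when the transaction flushed ordered
+         * data blocks (t_flushed_data_blocks), (b) before the journal tail
+         * is dropped during a checkpoint, and (c) from fsync() in
+         * data=writeback mode, where the jbd2 commit will not flush the
+         * data device for us.
+         */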
+
+Thanks to Oleg Drokin for pointing out this flaw in ext3/ext4.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/fsync.c | 16 ++++++++++++++--
+ fs/jbd2/checkpoint.c | 15 +++++++++++++++
+ fs/jbd2/commit.c | 19 +++++++++++--------
+ include/linux/jbd2.h | 1 +
+ 4 files changed, 41 insertions(+), 10 deletions(-)
+
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, st
+ return ext4_force_commit(inode->i_sb);
+
+ commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+- if (jbd2_log_start_commit(journal, commit_tid))
++ if (jbd2_log_start_commit(journal, commit_tid)) {
++ /*
++ * When the journal is on a different device than the
++ * fs data disk, we need to issue the barrier in
++ * writeback mode. (In ordered mode, the jbd2 layer
++ * will take care of issuing the barrier. In
++ * data=journal, all of the data blocks are written to
++ * the journal device.)
++ */
++ if (ext4_should_writeback_data(inode) &&
++ (journal->j_fs_dev != journal->j_dev) &&
++ (journal->j_flags & JBD2_BARRIER))
++ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ jbd2_log_wait_commit(journal, commit_tid);
+- else if (journal->j_flags & JBD2_BARRIER)
++ } else if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ return ret;
+ }
+--- a/fs/jbd2/checkpoint.c
++++ b/fs/jbd2/checkpoint.c
+@@ -22,6 +22,7 @@
+ #include <linux/jbd2.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
++#include <linux/blkdev.h>
+ #include <trace/events/jbd2.h>
+
+ /*
+@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t
+ journal->j_tail_sequence = first_tid;
+ journal->j_tail = blocknr;
+ spin_unlock(&journal->j_state_lock);
++
++ /*
++ * If there is an external journal, we need to make sure that
++ * any data blocks that were recently written out --- perhaps
++ * by jbd2_log_do_checkpoint() --- are flushed out before we
++ * drop the transactions from the external journal. It's
++ * unlikely this will be necessary, especially with a
++ * appropriately sized journal, but we need this to guarantee
++ * correctness. Fortunately jbd2_cleanup_journal_tail()
++ * doesn't get called all that often.
++ */
++ if ((journal->j_fs_dev != journal->j_dev) &&
++ (journal->j_flags & JBD2_BARRIER))
++ blkdev_issue_flush(journal->j_fs_dev, NULL);
+ if (!(journal->j_flags & JBD2_ABORT))
+ jbd2_journal_update_superblock(journal, 1);
+ return 0;
+--- a/fs/jbd2/commit.c
++++ b/fs/jbd2/commit.c
+@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(j
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
++ commit_transaction->t_flushed_data_blocks = 1;
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
+@@ -708,8 +709,17 @@ start_journal_io:
+ }
+ }
+
+- /* Done it all: now write the commit record asynchronously. */
++ /*
++ * If the journal is not located on the file system device,
++ * then we must flush the file system device before we issue
++ * the commit record
++ */
++ if (commit_transaction->t_flushed_data_blocks &&
++ (journal->j_fs_dev != journal->j_dev) &&
++ (journal->j_flags & JBD2_BARRIER))
++ blkdev_issue_flush(journal->j_fs_dev, NULL);
+
++ /* Done it all: now write the commit record asynchronously. */
+ if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ err = journal_submit_commit_record(journal, commit_transaction,
+@@ -720,13 +730,6 @@ start_journal_io:
+ blkdev_issue_flush(journal->j_dev, NULL);
+ }
+
+- /*
+- * This is the right place to wait for data buffers both for ASYNC
+- * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+- * the commit block went to disk (which happens above). If commit is
+- * SYNC, we need to wait for data buffers before we start writing
+- * commit block, which happens below in such setting.
+- */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err) {
+ printk(KERN_WARNING
+--- a/include/linux/jbd2.h
++++ b/include/linux/jbd2.h
+@@ -653,6 +653,7 @@ struct transaction_s
+ * waiting for it to finish.
+ */
+ unsigned int t_synchronous_commit:1;
++ unsigned int t_flushed_data_blocks:1;
+
+ /*
+ * For use by the filesystem to store fs-specific data
--- /dev/null
+From 51e00c5c8ddedce8030521bf8645d90b82854980 Mon Sep 17 00:00:00 2001
+From: Marcelo Tosatti <mtosatti@redhat.com>
+Date: Fri, 28 May 2010 09:44:59 -0300
+Subject: KVM: MMU: invalidate and flush on spte small->large page size change
+
+Always invalidate spte and flush TLBs when changing page size, to make
+sure different sized translations for the same address are never cached
+in a CPU's TLB.
+
+Currently the only case where this occurs is when a non-leaf spte pointer is
+overwritten by a leaf, large spte entry. This can happen after dirty
+logging is disabled on a memslot, for example.
+
+Noticed by Andrea.
+
+KVM-Stable-Tag
+Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit 3be2264be3c00865116f997dc53ebcc90fe7fc4b)
+---
+ arch/x86/kvm/mmu.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -1901,6 +1901,8 @@ static void mmu_set_spte(struct kvm_vcpu
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, sptep);
++ __set_spte(sptep, shadow_trap_nonpresent_pte);
++ kvm_flush_remote_tlbs(vcpu->kvm);
+ } else if (pfn != spte_to_pfn(*sptep)) {
+ pgprintk("hfn old %lx new %lx\n",
+ spte_to_pfn(*sptep), pfn);
--- /dev/null
+From 857855f2523af677951cb3bba61396813df6128d Mon Sep 17 00:00:00 2001
+From: Julia Lawall <julia@diku.dk>
+Date: Sun, 30 May 2010 22:49:18 -0400
+Subject: ext4: Eliminate potential double free on error path
+
+commit d3533d72e7478a61a3e1936956fc825289a2acf4 upstream (as of v2.6.33-rc3)
+
+b_entry_name and buffer are initially NULL, are initialized within a loop
+to the result of calling kmalloc, and are freed at the bottom of this loop.
+The loop contains gotos to cleanup, which also frees b_entry_name and
+buffer. Some of these gotos are before the reinitializations of
+b_entry_name and buffer. To maintain the invariant that b_entry_name and
+buffer are NULL at the top of the loop, and thus acceptable arguments to
+kfree, these variables are now set to NULL after the kfrees.
+
+This seems to be the simplest solution. A more complicated solution
+would be to introduce more labels in the error handling code at the end of
+the function.
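+
+The invariant being restored is the usual "reset to NULL after kfree"
+pattern; a minimal standalone sketch (editor's note: generic illustration,
+not the xattr code itself; more_work(), some_check_fails(), len and size
+are placeholders):
+
+        char *name = NULL, *buf = NULL;   /* NULL at the top of every pass */
+
+        while (more_work()) {
+                if (some_check_fails())
+                        goto cleanup;     /* may run before the kmallocs */
+                name = kmalloc(len, GFP_NOFS);
+                buf = kmalloc(size, GFP_NOFS);
+                if (!name || !buf)
+                        goto cleanup;
+                /* ... use name and buf ... */
+                kfree(name);
+                kfree(buf);
+                name = NULL;              /* without these, a goto on the  */
+                buf = NULL;               /* next pass would kfree() stale */
+                                          /* pointers a second time        */
+        }
+cleanup:
+        kfree(name);                      /* kfree(NULL) is a no-op */
+        kfree(buf);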
+
+A simplified version of the semantic match that finds this problem is as
+follows: (http://coccinelle.lip6.fr/)
+
+// <smpl>
+@r@
+identifier E;
+expression E1;
+iterator I;
+statement S;
+@@
+
+*kfree(E);
+... when != E = E1
+ when != I(E,...) S
+ when != &E
+*kfree(E);
+// </smpl>
+
+Signed-off-by: Julia Lawall <julia@diku.dk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/xattr.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1327,6 +1327,8 @@ retry:
+ goto cleanup;
+ kfree(b_entry_name);
+ kfree(buffer);
++ b_entry_name = NULL;
++ buffer = NULL;
+ brelse(is->iloc.bh);
+ kfree(is);
+ kfree(bs);
--- /dev/null
+From 657eba6d2e9501946a11cc4f53148e46e4b3cbe1 Mon Sep 17 00:00:00 2001
+From: Richard Kennedy <richard@rsk.demon.co.uk>
+Date: Sun, 30 May 2010 22:49:19 -0400
+Subject: ext4: return correct wbc.nr_to_write in ext4_da_writepages
+
+commit 2faf2e19dd0e060eeb32442858ef495ac3083277 upstream (as of v2.6.33-rc3)
+
+When ext4_da_writepages increases the nr_to_write in writeback_control
+then it must always re-base the return value. Originally there was a
+(misguided) attempt prevent wbc.nr_to_write from going negative. In
+fact, it's necessary to allow nr_to_write to be negative so that
+wb_writeback() can correctly calculate how many pages were actually
+written.
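+
+A small worked example of why the unconditional subtraction matters
+(editor's note: the numbers are invented for illustration):
+
+        /* wb_writeback() hands us nr_to_write = 1024 and, on return,
+         * treats 1024 - wbc->nr_to_write as the number of pages written.
+         * Suppose ext4_da_writepages bumps nr_to_write by 32 and then
+         * writes 1040 pages, leaving nr_to_write = 16.
+         *
+         * Old code: 16 > 32 is false, so nothing is subtracted and the
+         *           caller thinks only 1024 - 16 = 1008 pages went out.
+         * New code: 16 - 32 = -16, so the caller computes
+         *           1024 - (-16) = 1040, which is correct even though
+         *           nr_to_write ended up negative.
+         */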
+
+Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3010,8 +3010,7 @@ retry:
+ out_writepages:
+ if (!no_nrwrite_index_update)
+ wbc->no_nrwrite_index_update = 0;
+- if (wbc->nr_to_write > nr_to_writebump)
+- wbc->nr_to_write -= nr_to_writebump;
++ wbc->nr_to_write -= nr_to_writebump;
+ wbc->range_start = range_start;
+ trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+ return ret;
--- /dev/null
+From 436e2704a8b589fb1217add4f9e5be480773ca6c Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:20 -0400
+Subject: ext4: Ensure zeroout blocks have no dirty metadata
+
+commit 515f41c33a9d44a964264c9511ad2c869af1fac3 upstream (as of v2.6.33-rc3)
+
+This fixes a bug (found by Curt Wohlgemuth) in which new blocks
+returned from an extent created with ext4_ext_zeroout() can have dirty
+metadata still associated with them.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3029,6 +3029,14 @@ out:
+ return err;
+ }
+
++static void unmap_underlying_metadata_blocks(struct block_device *bdev,
++ sector_t block, int count)
++{
++ int i;
++ for (i = 0; i < count; i++)
++ unmap_underlying_metadata(bdev, block + i);
++}
++
+ static int
+ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, unsigned int max_blocks,
+@@ -3104,6 +3112,18 @@ out:
+ } else
+ allocated = ret;
+ set_buffer_new(bh_result);
++ /*
++ * if we allocated more blocks than requested
++ * we need to make sure we unmap the extra block
++ * allocated. The actual needed block will get
++ * unmapped later when we find the buffer_head marked
++ * new.
++ */
++ if (allocated > max_blocks) {
++ unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
++ newblock + max_blocks,
++ allocated - max_blocks);
++ }
+ map_out:
+ set_buffer_mapped(bh_result);
+ out1:
--- /dev/null
+From 74ded2cc0427839ccdda41f2738130f0eea77fde Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:21 -0400
+Subject: ext4: Patch up how we claim metadata blocks for quota purposes
+
+commit 0637c6f4135f592f094207c7c21e7c0fc5557834 upstream (as of v2.6.33-rc3)
+
+As reported in Kernel Bugzilla #14936, commit d21cd8f triggered a BUG
+in the function ext4_da_update_reserve_space() found in
+fs/ext4/inode.c. The root cause of this BUG() was caused by the fact
+that ext4_calc_metadata_amount() can severely over-estimate how many
+metadata blocks will be needed, especially when using direct
+block-mapped files.
+
+In addition, it can also badly *under* estimate how much space is
+needed, since ext4_calc_metadata_amount() assumes that the blocks are
+contiguous, and this is not always true. If the application is
+writing blocks to a sparse file, the number of metadata blocks
+necessary can be severely underestimated by the functions
+ext4_da_reserve_space(), ext4_da_update_reserve_space() and
+ext4_da_release_space(). This was the cause of the dq_claim_space
+reports found on kerneloops.org.
+
+Unfortunately, doing this right means that we need to massively
+over-estimate the amount of free space needed. So in some cases we
+may need to force the inode to be written to disk asynchronously in
+order to avoid spurious quota failures.
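+
+A small worked example of the under-estimate for a sparse block-mapped file
+(editor's note: numbers assume a 4KB block size, so one indirect block maps
+1024 data blocks):
+
+        /* The old ext4_indirect_calc_metadata_amount(inode, 3) assumed the
+         * three delayed blocks are contiguous:
+         *      ind_blks = 1, dind_blks = 1, tind_blks = 1  ->  estimate 3
+         * If the three blocks instead land in three widely separated holes,
+         * each may need its own indirect block and possibly its own
+         * doubly-indirect block, so the real requirement can be more than
+         * twice the estimate -- the shortfall behind the dq_claim_space
+         * reports mentioned above.
+         */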
+
+http://bugzilla.kernel.org/show_bug.cgi?id=14936
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 153 ++++++++++++++++++++++++++++++--------------------------
+ 1 file changed, 82 insertions(+), 71 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1085,43 +1085,47 @@ static int ext4_calc_metadata_amount(str
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
+ }
+
++/*
++ * Called with i_data_sem down, which is important since we can call
++ * ext4_discard_preallocations() from here.
++ */
+ static void ext4_da_update_reserve_space(struct inode *inode, int used)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+- int total, mdb, mdb_free, mdb_claim = 0;
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ int mdb_free = 0;
+
+- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+- /* recalculate the number of metablocks still need to be reserved */
+- total = EXT4_I(inode)->i_reserved_data_blocks - used;
+- mdb = ext4_calc_metadata_amount(inode, total);
+-
+- /* figure out how many metablocks to release */
+- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+-
+- if (mdb_free) {
+- /* Account for allocated meta_blocks */
+- mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
+- BUG_ON(mdb_free < mdb_claim);
+- mdb_free -= mdb_claim;
++ spin_lock(&ei->i_block_reservation_lock);
++ if (unlikely(used > ei->i_reserved_data_blocks)) {
++ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
++ "with only %d reserved data blocks\n",
++ __func__, inode->i_ino, used,
++ ei->i_reserved_data_blocks);
++ WARN_ON(1);
++ used = ei->i_reserved_data_blocks;
++ }
++
++ /* Update per-inode reservations */
++ ei->i_reserved_data_blocks -= used;
++ used += ei->i_allocated_meta_blocks;
++ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
++ ei->i_allocated_meta_blocks = 0;
++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+- /* update fs dirty blocks counter */
++ if (ei->i_reserved_data_blocks == 0) {
++ /*
++ * We can release all of the reserved metadata blocks
++ * only when we have written all of the delayed
++ * allocation blocks.
++ */
++ mdb_free = ei->i_allocated_meta_blocks;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+- EXT4_I(inode)->i_allocated_meta_blocks = 0;
+- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
++ ei->i_allocated_meta_blocks = 0;
+ }
+-
+- /* update per-inode reservations */
+- BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
+- EXT4_I(inode)->i_reserved_data_blocks -= used;
+- percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+- vfs_dq_claim_block(inode, used + mdb_claim);
+-
+- /*
+- * free those over-booking quota for metadata blocks
+- */
++ /* Update quota subsystem */
++ vfs_dq_claim_block(inode, used);
+ if (mdb_free)
+ vfs_dq_release_reservation_block(inode, mdb_free);
+
+@@ -1130,7 +1134,8 @@ static void ext4_da_update_reserve_space
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
+ */
+- if (!total && (atomic_read(&inode->i_writecount) == 0))
++ if ((ei->i_reserved_data_blocks == 0) &&
++ (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
+ }
+
+@@ -1843,7 +1848,8 @@ static int ext4_da_reserve_space(struct
+ {
+ int retries = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+- unsigned long md_needed, mdblocks, total = 0;
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ unsigned long md_needed, md_reserved, total = 0;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+@@ -1851,35 +1857,44 @@ static int ext4_da_reserve_space(struct
+ * worse case is one extent per block
+ */
+ repeat:
+- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+- total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+- mdblocks = ext4_calc_metadata_amount(inode, total);
+- BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+-
+- md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
++ spin_lock(&ei->i_block_reservation_lock);
++ md_reserved = ei->i_reserved_meta_blocks;
++ md_needed = ext4_calc_metadata_amount(inode, nrblocks);
+ total = md_needed + nrblocks;
+- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++ spin_unlock(&ei->i_block_reservation_lock);
+
+ /*
+ * Make quota reservation here to prevent quota overflow
+ * later. Real quota accounting is done at pages writeout
+ * time.
+ */
+- if (vfs_dq_reserve_block(inode, total))
++ if (vfs_dq_reserve_block(inode, total)) {
++ /*
++ * We tend to badly over-estimate the amount of
++ * metadata blocks which are needed, so if we have
++ * reserved any metadata blocks, try to force out the
++ * inode and see if we have any better luck.
++ */
++ if (md_reserved && retries++ <= 3)
++ goto retry;
+ return -EDQUOT;
++ }
+
+ if (ext4_claim_free_blocks(sbi, total)) {
+ vfs_dq_release_reservation_block(inode, total);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
++ retry:
++ if (md_reserved)
++ write_inode_now(inode, (retries == 3));
+ yield();
+ goto repeat;
+ }
+ return -ENOSPC;
+ }
+- spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+- EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+- EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
+- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++ spin_lock(&ei->i_block_reservation_lock);
++ ei->i_reserved_data_blocks += nrblocks;
++ ei->i_reserved_meta_blocks += md_needed;
++ spin_unlock(&ei->i_block_reservation_lock);
+
+ return 0; /* success */
+ }
+@@ -1887,49 +1902,45 @@ repeat:
+ static void ext4_da_release_space(struct inode *inode, int to_free)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+- int total, mdb, mdb_free, release;
++ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!to_free)
+ return; /* Nothing to release, exit */
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+
+- if (!EXT4_I(inode)->i_reserved_data_blocks) {
++ if (unlikely(to_free > ei->i_reserved_data_blocks)) {
+ /*
+- * if there is no reserved blocks, but we try to free some
+- * then the counter is messed up somewhere.
+- * but since this function is called from invalidate
+- * page, it's harmless to return without any action
++ * if there aren't enough reserved blocks, then the
++ * counter is messed up somewhere. Since this
++ * function is called from invalidate page, it's
++ * harmless to return without any action.
+ */
+- printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+- "blocks for inode %lu, but there is no reserved "
+- "data blocks\n", to_free, inode->i_ino);
+- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+- return;
++ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
++ "ino %lu, to_free %d with only %d reserved "
++ "data blocks\n", inode->i_ino, to_free,
++ ei->i_reserved_data_blocks);
++ WARN_ON(1);
++ to_free = ei->i_reserved_data_blocks;
+ }
++ ei->i_reserved_data_blocks -= to_free;
+
+- /* recalculate the number of metablocks still need to be reserved */
+- total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
+- mdb = ext4_calc_metadata_amount(inode, total);
+-
+- /* figure out how many metablocks to release */
+- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+-
+- release = to_free + mdb_free;
+-
+- /* update fs dirty blocks counter for truncate case */
+- percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
++ if (ei->i_reserved_data_blocks == 0) {
++ /*
++ * We can release all of the reserved metadata blocks
++ * only when we have written all of the delayed
++ * allocation blocks.
++ */
++ to_free += ei->i_allocated_meta_blocks;
++ ei->i_allocated_meta_blocks = 0;
++ }
+
+- /* update per-inode reservations */
+- BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+- EXT4_I(inode)->i_reserved_data_blocks -= to_free;
++ /* update fs dirty blocks counter */
++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
+
+- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+- EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+- vfs_dq_release_reservation_block(inode, release);
++ vfs_dq_release_reservation_block(inode, to_free);
+ }
+
+ static void ext4_da_page_release_reservation(struct page *page,
--- /dev/null
+From 81799214a5369211cf9046735dafcf59a29e7454 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:22 -0400
+Subject: ext4: Fix accounting of reserved metadata blocks
+
+commit ee5f4d9cdf32fd99172d11665c592a288c2b1ff4 upstream (as of v2.6.33-rc3)
+
+Commit 0637c6f had a typo which caused the reserved metadata blocks to
+not be released correctly. Fix this.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1118,9 +1118,9 @@ static void ext4_da_update_reserve_space
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+- mdb_free = ei->i_allocated_meta_blocks;
++ mdb_free = ei->i_reserved_meta_blocks;
++ ei->i_reserved_meta_blocks = 0;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+- ei->i_allocated_meta_blocks = 0;
+ }
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+@@ -1931,8 +1931,8 @@ static void ext4_da_release_space(struct
+ * only when we have written all of the delayed
+ * allocation blocks.
+ */
+- to_free += ei->i_allocated_meta_blocks;
+- ei->i_allocated_meta_blocks = 0;
++ to_free += ei->i_reserved_meta_blocks;
++ ei->i_reserved_meta_blocks = 0;
+ }
+
+ /* update fs dirty blocks counter */
--- /dev/null
+From 665d82f8d039371ba402227e99d3b95078c97fb9 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:23 -0400
+Subject: ext4: Calculate metadata requirements more accurately
+
+commit 9d0be50230b333005635967f7ecd4897dbfd181b upstream (as of v2.6.33-rc3)
+
+In the past, ext4_calc_metadata_amount(), and its sub-functions
+ext4_ext_calc_metadata_amount() and ext4_indirect_calc_metadata_amount()
+badly over-estimated the number of metadata blocks that might be
+required for delayed allocation blocks. This didn't matter as much
+when functions which managed the reserved metadata blocks were more
+aggressive about dropping reserved metadata blocks as delayed
+allocation blocks were written, but unfortunately they were too
+aggressive. This was fixed in commit 0637c6f, but as a result the
+over-estimation by ext4_calc_metadata_amount() would lead to reserving
+2-3 times the number of pending delayed allocation blocks as
+potentially required metadata blocks. So if there is 1 megabyte of
+blocks which has not yet been allocated, up to 3 megabytes of
+space would get reserved out of the user's quota and from the file
+system free space pool until all of the inode's data blocks have been
+allocated.
+
+This commit addresses this problem by much more accurately estimating
+the number of metadata blocks that will be required. It will still
+somewhat over-estimate the number of blocks needed, since it must make
+a worst case estimate not knowing which physical blocks will be
+needed, but it is much more accurate than before.
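+
+A rough feel for the new per-block estimate (editor's note: numbers assume a
+4KB block size and an extent-mapped file):
+
+        idxs = (4096 - sizeof(struct ext4_extent_header))
+                / sizeof(struct ext4_extent_idx);       /* ~340 */
+
+        /* A delayed block that continues the previous run is charged one
+         * extra metadata block only every ~340 blocks (plus the higher
+         * index levels at ~340^2 and ~340^3 boundaries); a block that
+         * starts a new, discontiguous run is charged ext_depth(inode) + 1.
+         * Either way the reservation now tracks the write pattern instead
+         * of being 2-3 times the number of pending data blocks.
+         */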
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 2 +
+ fs/ext4/ext4_extents.h | 3 +-
+ fs/ext4/extents.c | 49 ++++++++++++++++++++++++-------------
+ fs/ext4/inode.c | 62 +++++++++++++++++++++++++++--------------------
+ fs/ext4/super.c | 1 +
+ 5 files changed, 73 insertions(+), 44 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 4a825c1..23bfbbc 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -693,6 +693,8 @@ struct ext4_inode_info {
+ unsigned int i_reserved_meta_blocks;
+ unsigned int i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
++ sector_t i_da_metadata_calc_last_lblock;
++ int i_da_metadata_calc_len;
+
+ /* on-disk additional length */
+ __u16 i_extra_isize;
+diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
+index 2ca6864..bdb6ce7 100644
+--- a/fs/ext4/ext4_extents.h
++++ b/fs/ext4/ext4_extents.h
+@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+ ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+ }
+
+-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
++extern int ext4_ext_calc_metadata_amount(struct inode *inode,
++ sector_t lblocks);
+ extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
+ extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
+ extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index b14fb6d..5f03f9f 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
++int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+ {
+- int lcap, icap, rcap, leafs, idxs, num;
+- int newextents = blocks;
+-
+- rcap = ext4_ext_space_root_idx(inode, 0);
+- lcap = ext4_ext_space_block(inode, 0);
+- icap = ext4_ext_space_block_idx(inode, 0);
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ int idxs, num = 0;
+
+- /* number of new leaf blocks needed */
+- num = leafs = (newextents + lcap - 1) / lcap;
++ idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
++ / sizeof(struct ext4_extent_idx));
+
+ /*
+- * Worse case, we need separate index block(s)
+- * to link all new leaf blocks
++ * If the new delayed allocation block is contiguous with the
++ * previous da block, it can share index blocks with the
++ * previous block, so we only need to allocate a new index
++ * block every idxs leaf blocks. At ldxs**2 blocks, we need
++ * an additional index block, and at ldxs**3 blocks, yet
++ * another index blocks.
+ */
+- idxs = (leafs + icap - 1) / icap;
+- do {
+- num += idxs;
+- idxs = (idxs + icap - 1) / icap;
+- } while (idxs > rcap);
++ if (ei->i_da_metadata_calc_len &&
++ ei->i_da_metadata_calc_last_lblock+1 == lblock) {
++ if ((ei->i_da_metadata_calc_len % idxs) == 0)
++ num++;
++ if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
++ num++;
++ if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
++ num++;
++ ei->i_da_metadata_calc_len = 0;
++ } else
++ ei->i_da_metadata_calc_len++;
++ ei->i_da_metadata_calc_last_lblock++;
++ return num;
++ }
+
+- return num;
++ /*
++ * In the worst case we need a new set of index blocks at
++ * every level of the inode's extent tree.
++ */
++ ei->i_da_metadata_calc_len = 1;
++ ei->i_da_metadata_calc_last_lblock = lblock;
++ return ext_depth(inode) + 1;
+ }
+
+ static int
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 533bb84..2e3f422 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1051,38 +1051,44 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
+ return &EXT4_I(inode)->i_reserved_quota;
+ }
+ #endif
++
+ /*
+ * Calculate the number of metadata blocks need to reserve
+- * to allocate @blocks for non extent file based file
++ * to allocate a new block at @lblocks for non extent file based file
+ */
+-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
++static int ext4_indirect_calc_metadata_amount(struct inode *inode,
++ sector_t lblock)
+ {
+- int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+- int ind_blks, dind_blks, tind_blks;
+-
+- /* number of new indirect blocks needed */
+- ind_blks = (blocks + icap - 1) / icap;
++ struct ext4_inode_info *ei = EXT4_I(inode);
++ int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
++ int blk_bits;
+
+- dind_blks = (ind_blks + icap - 1) / icap;
++ if (lblock < EXT4_NDIR_BLOCKS)
++ return 0;
+
+- tind_blks = 1;
++ lblock -= EXT4_NDIR_BLOCKS;
+
+- return ind_blks + dind_blks + tind_blks;
++ if (ei->i_da_metadata_calc_len &&
++ (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
++ ei->i_da_metadata_calc_len++;
++ return 0;
++ }
++ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
++ ei->i_da_metadata_calc_len = 1;
++ blk_bits = roundup_pow_of_two(lblock + 1);
++ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+ }
+
+ /*
+ * Calculate the number of metadata blocks need to reserve
+- * to allocate given number of blocks
++ * to allocate a block located at @lblock
+ */
+-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
++static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+ {
+- if (!blocks)
+- return 0;
+-
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+- return ext4_ext_calc_metadata_amount(inode, blocks);
++ return ext4_ext_calc_metadata_amount(inode, lblock);
+
+- return ext4_indirect_calc_metadata_amount(inode, blocks);
++ return ext4_indirect_calc_metadata_amount(inode, lblock);
+ }
+
+ /*
+@@ -1120,6 +1126,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
+ */
+ mdb_free = ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
++ ei->i_da_metadata_calc_len = 0;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+ }
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+@@ -1844,12 +1851,15 @@ static int ext4_journalled_write_end(struct file *file,
+ return ret ? ret : copied;
+ }
+
+-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
++/*
++ * Reserve a single block located at lblock
++ */
++static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+ {
+ int retries = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+- unsigned long md_needed, md_reserved, total = 0;
++ unsigned long md_needed, md_reserved;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+@@ -1859,8 +1869,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+ repeat:
+ spin_lock(&ei->i_block_reservation_lock);
+ md_reserved = ei->i_reserved_meta_blocks;
+- md_needed = ext4_calc_metadata_amount(inode, nrblocks);
+- total = md_needed + nrblocks;
++ md_needed = ext4_calc_metadata_amount(inode, lblock);
+ spin_unlock(&ei->i_block_reservation_lock);
+
+ /*
+@@ -1868,7 +1877,7 @@ repeat:
+ * later. Real quota accounting is done at pages writeout
+ * time.
+ */
+- if (vfs_dq_reserve_block(inode, total)) {
++ if (vfs_dq_reserve_block(inode, md_needed + 1)) {
+ /*
+ * We tend to badly over-estimate the amount of
+ * metadata blocks which are needed, so if we have
+@@ -1880,8 +1889,8 @@ repeat:
+ return -EDQUOT;
+ }
+
+- if (ext4_claim_free_blocks(sbi, total)) {
+- vfs_dq_release_reservation_block(inode, total);
++ if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
++ vfs_dq_release_reservation_block(inode, md_needed + 1);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ retry:
+ if (md_reserved)
+@@ -1892,7 +1901,7 @@ repeat:
+ return -ENOSPC;
+ }
+ spin_lock(&ei->i_block_reservation_lock);
+- ei->i_reserved_data_blocks += nrblocks;
++ ei->i_reserved_data_blocks++;
+ ei->i_reserved_meta_blocks += md_needed;
+ spin_unlock(&ei->i_block_reservation_lock);
+
+@@ -1933,6 +1942,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
+ */
+ to_free += ei->i_reserved_meta_blocks;
+ ei->i_reserved_meta_blocks = 0;
++ ei->i_da_metadata_calc_len = 0;
+ }
+
+ /* update fs dirty blocks counter */
+@@ -2546,7 +2556,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+- ret = ext4_da_reserve_space(inode, 1);
++ ret = ext4_da_reserve_space(inode, iblock);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 92943f2..252f30b 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -702,6 +702,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
++ ei->i_da_metadata_calc_len = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
+ #ifdef CONFIG_QUOTA
+--
+1.7.1
+
--- /dev/null
+From 34e8248f530c4db6c4ba200c945257e0713d9905 Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:24 -0400
+Subject: ext4: Handle -EDQUOT error on write
+
+commit 1db913823c0f8360fccbd24ca67eb073966a5ffd upstream (as of v2.6.33-rc6)
+
+We need to release the journal before we do a write_inode. Otherwise
+we could deadlock.
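+
+A minimal sketch of the ordering that matters, assuming (editor's note) the
+deadlock comes from forcing writeback while the caller still holds an open
+journal handle:
+
+        /* old: ext4_da_write_begin()
+         *        handle = ext4_journal_start(...);
+         *        ...ext4_da_reserve_space() hits -EDQUOT
+         *          -> write_inode_now()   <-- handle still open; the forced
+         *                                     writeback may wait on the
+         *                                     journal and deadlock
+         *
+         * new: ext4_da_write_begin()
+         *        error path runs ext4_journal_stop(handle) first, and only
+         *        then, on -EDQUOT with over-reserved metadata:
+         *          write_inode_now();  goto retry;
+         */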
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1877,24 +1877,12 @@ repeat:
+ * later. Real quota accounting is done at pages writeout
+ * time.
+ */
+- if (vfs_dq_reserve_block(inode, md_needed + 1)) {
+- /*
+- * We tend to badly over-estimate the amount of
+- * metadata blocks which are needed, so if we have
+- * reserved any metadata blocks, try to force out the
+- * inode and see if we have any better luck.
+- */
+- if (md_reserved && retries++ <= 3)
+- goto retry;
++ if (vfs_dq_reserve_block(inode, md_needed + 1))
+ return -EDQUOT;
+- }
+
+ if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+ vfs_dq_release_reservation_block(inode, md_needed + 1);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+- retry:
+- if (md_reserved)
+- write_inode_now(inode, (retries == 3));
+ yield();
+ goto repeat;
+ }
+@@ -3075,7 +3063,7 @@ static int ext4_da_write_begin(struct fi
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+ {
+- int ret, retries = 0;
++ int ret, retries = 0, quota_retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+@@ -3134,6 +3122,22 @@ retry:
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
++
++ if ((ret == -EDQUOT) &&
++ EXT4_I(inode)->i_reserved_meta_blocks &&
++ (quota_retries++ < 3)) {
++ /*
++ * Since we often over-estimate the number of meta
++ * data blocks required, we may sometimes get a
++ * spurios out of quota error even though there would
++ * be enough space once we write the data blocks and
++ * find out how many meta data blocks were _really_
++ * required. So try forcing the inode write to see if
++ * that helps.
++ */
++ write_inode_now(inode, (quota_retries == 3));
++ goto retry;
++ }
+ out:
+ return ret;
+ }
--- /dev/null
+From 09e8f5642b741ecfdd05c259b47796f85fdd01aa Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:25 -0400
+Subject: ext4: Fix quota accounting error with fallocate
+
+commit 5f634d064c709ea02c3cdaa850a08323a4a4bf28 upstream (as of v2.6.33-rc6)
+
+When we fallocate a region of the file which we had recently written,
+and which is still in the page cache marked as delayed allocated blocks,
+we need to make sure we don't do the quota update on the writepage path.
+This is because the needed quota update would already have been done
+by fallocate.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 2 ++
+ fs/ext4/extents.c | 21 +++++++++++++++++++++
+ fs/ext4/inode.c | 44 +++++++++++++++++++++++++++++++-------------
+ 3 files changed, 54 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1440,6 +1440,8 @@ extern int ext4_block_truncate_page(hand
+ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+ extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+ extern int flush_aio_dio_completed_IO(struct inode *inode);
++extern void ext4_da_update_reserve_space(struct inode *inode,
++ int used, int quota_claim);
+ /* ioctl.c */
+ extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3138,7 +3138,19 @@ out:
+ unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+ newblock + max_blocks,
+ allocated - max_blocks);
++ allocated = max_blocks;
+ }
++
++ /*
++ * If we have done fallocate with the offset that is already
++ * delayed allocated, we would have block reservation
++ * and quota reservation done in the delayed write path.
++ * But fallocate would have already updated quota and block
++ * count for this offset. So cancel these reservation
++ */
++ if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++ ext4_da_update_reserve_space(inode, allocated, 0);
++
+ map_out:
+ set_buffer_mapped(bh_result);
+ out1:
+@@ -3374,9 +3386,18 @@ int ext4_ext_get_blocks(handle_t *handle
+ /* previous routine could use block we allocated */
+ newblock = ext_pblock(&newex);
+ allocated = ext4_ext_get_actual_len(&newex);
++ if (allocated > max_blocks)
++ allocated = max_blocks;
+ set_buffer_new(bh_result);
+
+ /*
++ * Update reserved blocks/metadata blocks after successful
++ * block allocation which had been deferred till now.
++ */
++ if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++ ext4_da_update_reserve_space(inode, allocated, 1);
++
++ /*
+ * Cache the extent and update transaction to commit on fdatasync only
+ * when it is _not_ an uninitialized extent.
+ */
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1095,11 +1095,12 @@ static int ext4_calc_metadata_amount(str
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
+-static void ext4_da_update_reserve_space(struct inode *inode, int used)
++void ext4_da_update_reserve_space(struct inode *inode,
++ int used, int quota_claim)
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
+- int mdb_free = 0;
++ int mdb_free = 0, allocated_meta_blocks = 0;
+
+ spin_lock(&ei->i_block_reservation_lock);
+ if (unlikely(used > ei->i_reserved_data_blocks)) {
+@@ -1115,6 +1116,7 @@ static void ext4_da_update_reserve_space
+ ei->i_reserved_data_blocks -= used;
+ used += ei->i_allocated_meta_blocks;
+ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
++ allocated_meta_blocks = ei->i_allocated_meta_blocks;
+ ei->i_allocated_meta_blocks = 0;
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+
+@@ -1132,9 +1134,23 @@ static void ext4_da_update_reserve_space
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ /* Update quota subsystem */
+- vfs_dq_claim_block(inode, used);
+- if (mdb_free)
+- vfs_dq_release_reservation_block(inode, mdb_free);
++ if (quota_claim) {
++ vfs_dq_claim_block(inode, used);
++ if (mdb_free)
++ vfs_dq_release_reservation_block(inode, mdb_free);
++ } else {
++ /*
++ * We did fallocate with an offset that is already delayed
++ * allocated. So on delayed allocated writeback we should
++ * not update the quota for allocated blocks. But then
++ * converting an fallocate region to initialized region would
++ * have caused a metadata allocation. So claim quota for
++ * that
++ */
++ if (allocated_meta_blocks)
++ vfs_dq_claim_block(inode, allocated_meta_blocks);
++ vfs_dq_release_reservation_block(inode, mdb_free + used);
++ }
+
+ /*
+ * If we have done all the pending block allocations and if
+@@ -1334,18 +1350,20 @@ int ext4_get_blocks(handle_t *handle, st
+ */
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+ }
+- }
+
++ /*
++ * Update reserved blocks/metadata blocks after successful
++ * block allocation which had been deferred till now. We don't
++ * support fallocate for non extent files. So we can update
++ * reserve space here.
++ */
++ if ((retval > 0) &&
++ (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
++ ext4_da_update_reserve_space(inode, retval, 1);
++ }
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+
+- /*
+- * Update reserved blocks/metadata blocks after successful
+- * block allocation which had been deferred till now.
+- */
+- if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
+- ext4_da_update_reserve_space(inode, retval);
+-
+ up_write((&EXT4_I(inode)->i_data_sem));
+ if (retval > 0 && buffer_mapped(bh)) {
+ int ret = check_block_validity(inode, "file system "
--- /dev/null
+From 3a1a12ca4219f564fe4f86cae1bfb563422a2d15 Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:26 -0400
+Subject: ext4: Drop EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE flag
+
+commit 1296cc85c26e94eb865d03f82140f27d598de467 upstream (as of v2.6.33-rc6)
+
+We should update the reserved space only if the buffer is a delalloc
+buffer, which is indicated by the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
+So use EXT4_GET_BLOCKS_DELALLOC_RESERVE in place of
+EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE.
+
+[ Stable note: This fixes a corruption caused by the following
+ reproduction case:
+
+ rm -f $TEST_FN
+ touch $TEST_FN
+ fallocate -n -o 656712 -l 858907 $TEST_FN
+ dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=1011020 count=36983
+ sync
+ dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=332121 count=24005
+ dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=1040179 count=93319
+
+ If the filesystem is then unmounted and e2fsck run forced, the
+ i_blocks field for the file $TEST_FN will be found to be incorrect. ]
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 7 ++-----
+ fs/ext4/extents.c | 4 ++--
+ fs/ext4/inode.c | 8 ++++----
+ 3 files changed, 8 insertions(+), 11 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -361,14 +361,11 @@ struct ext4_new_group_data {
+ so set the magic i_delalloc_reserve_flag after taking the
+ inode allocation semaphore for */
+ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
+- /* Call ext4_da_update_reserve_space() after successfully
+- allocating the blocks */
+-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008
+ /* caller is from the direct IO path, request to creation of an
+ unitialized extents if not allocated, split the uninitialized
+ extent if blocks has been preallocated already*/
+-#define EXT4_GET_BLOCKS_DIO 0x0010
+-#define EXT4_GET_BLOCKS_CONVERT 0x0020
++#define EXT4_GET_BLOCKS_DIO 0x0008
++#define EXT4_GET_BLOCKS_CONVERT 0x0010
+ #define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\
+ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ /* Convert extent to initialized after direct IO complete */
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3148,7 +3148,7 @@ out:
+ * But fallocate would have already updated quota and block
+ * count for this offset. So cancel these reservation
+ */
+- if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, allocated, 0);
+
+ map_out:
+@@ -3394,7 +3394,7 @@ int ext4_ext_get_blocks(handle_t *handle
+ * Update reserved blocks/metadata blocks after successful
+ * block allocation which had been deferred till now.
+ */
+- if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, allocated, 1);
+
+ /*
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1358,7 +1358,7 @@ int ext4_get_blocks(handle_t *handle, st
+ * reserve space here.
+ */
+ if ((retval > 0) &&
+- (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
++ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+ ext4_da_update_reserve_space(inode, retval, 1);
+ }
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+@@ -2261,10 +2261,10 @@ static int mpage_da_map_blocks(struct mp
+ * variables are updated after the blocks have been allocated.
+ */
+ new.b_state = 0;
+- get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+- EXT4_GET_BLOCKS_DELALLOC_RESERVE);
++ get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+ if (mpd->b_state & (1 << BH_Delay))
+- get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
++ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
++
+ blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+ &new, get_blocks_flags);
+ if (blks < 0) {
--- /dev/null
+From f7ae767b11e7ac054c5f8de55e5a83ec7c60c6a0 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:27 -0400
+Subject: ext4: Use bitops to read/modify EXT4_I(inode)->i_state
+
+commit 19f5fb7ad679bb361222c7916086435020c37cce upstream (as of v2.6.33-git11)
+
+At several places we modify EXT4_I(inode)->i_state without holding
+i_mutex (ext4_release_file, ext4_bmap, ext4_journalled_writepage,
+ext4_do_update_inode, ...). These modifications are racy and we can
+lose updates to i_state. So convert handling of i_state to use bitops
+which are atomic.
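+
+For illustration only (this snippet is not part of the patch; the
+helpers are the ones added to ext4.h below):
+
+    /* old style: a plain read-modify-write on the shared i_state
+     * word; two tasks can both load the old value and one of the
+     * two flag updates is silently lost */
+    EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;            /* task A */
+    EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;    /* task B */
+
+    /* new style: set_bit()/clear_bit() update each flag atomically */
+    ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+    ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);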
+
+Cc: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 41 +++++++++++++++++++++++++++++------------
+ fs/ext4/extents.c | 8 ++++----
+ fs/ext4/file.c | 4 ++--
+ fs/ext4/ialloc.c | 3 ++-
+ fs/ext4/inode.c | 38 ++++++++++++++++++++------------------
+ fs/ext4/migrate.c | 6 +++---
+ fs/ext4/xattr.c | 22 +++++++++++-----------
+ 7 files changed, 71 insertions(+), 51 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -313,17 +313,6 @@ static inline __u32 ext4_mask_flags(umod
+ return flags & EXT4_OTHER_FLMASK;
+ }
+
+-/*
+- * Inode dynamic state flags
+- */
+-#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */
+-#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
+-#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
+-#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
+-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
+-#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
+-#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/
+-
+ /* Used to pass group descriptor data when online resize is done */
+ struct ext4_new_group_input {
+ __u32 group; /* Group number for this data */
+@@ -624,7 +613,7 @@ struct ext4_inode_info {
+ * near to their parent directory's inode.
+ */
+ ext4_group_t i_block_group;
+- __u32 i_state; /* Dynamic state flags for ext4 */
++ unsigned long i_state_flags; /* Dynamic state flags */
+
+ ext4_lblk_t i_dir_start_lookup;
+ #ifdef CONFIG_EXT4_FS_XATTR
+@@ -1044,6 +1033,34 @@ static inline int ext4_valid_inum(struct
+ (ino >= EXT4_FIRST_INO(sb) &&
+ ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
+ }
++
++/*
++ * Inode dynamic state flags
++ */
++enum {
++ EXT4_STATE_JDATA, /* journaled data exists */
++ EXT4_STATE_NEW, /* inode is newly created */
++ EXT4_STATE_XATTR, /* has in-inode xattrs */
++ EXT4_STATE_NO_EXPAND, /* No space for expansion */
++ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
++ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
++ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
++};
++
++static inline int ext4_test_inode_state(struct inode *inode, int bit)
++{
++ return test_bit(bit, &EXT4_I(inode)->i_state_flags);
++}
++
++static inline void ext4_set_inode_state(struct inode *inode, int bit)
++{
++ set_bit(bit, &EXT4_I(inode)->i_state_flags);
++}
++
++static inline void ext4_clear_inode_state(struct inode *inode, int bit)
++{
++ clear_bit(bit, &EXT4_I(inode)->i_state_flags);
++}
+ #else
+ /* Assume that user mode programs are passing in an ext4fs superblock, not
+ * a kernel struct super_block. This will allow us to call the feature-test
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3082,7 +3082,7 @@ ext4_ext_handle_uninitialized_extents(ha
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+- EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
++ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ goto out;
+ }
+ /* async DIO end_io complete, convert the filled extent to written */
+@@ -3368,8 +3368,8 @@ int ext4_ext_get_blocks(handle_t *handle
+ if (io)
+ io->flag = DIO_AIO_UNWRITTEN;
+ else
+- EXT4_I(inode)->i_state |=
+- EXT4_STATE_DIO_UNWRITTEN;;
++ ext4_set_inode_state(inode,
++ EXT4_STATE_DIO_UNWRITTEN);
+ }
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+@@ -3745,7 +3745,7 @@ static int ext4_xattr_fiemap(struct inod
+ int error = 0;
+
+ /* in-inode? */
+- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
++ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+ struct ext4_iloc iloc;
+ int offset; /* offset of xattr in inode */
+
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -35,9 +35,9 @@
+ */
+ static int ext4_release_file(struct inode *inode, struct file *filp)
+ {
+- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
++ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
+ ext4_alloc_da_blocks(inode);
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
++ ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+ }
+ /* if we are the last writer on the inode, drop the block reservation */
+ if ((filp->f_mode & FMODE_WRITE) &&
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -1029,7 +1029,8 @@ got:
+ inode->i_generation = sbi->s_next_generation++;
+ spin_unlock(&sbi->s_next_gen_lock);
+
+- ei->i_state = EXT4_STATE_NEW;
++ ei->i_state_flags = 0;
++ ext4_set_inode_state(inode, EXT4_STATE_NEW);
+
+ ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1348,7 +1348,7 @@ int ext4_get_blocks(handle_t *handle, st
+ * i_data's format changing. Force the migrate
+ * to fail by clearing migrate flags
+ */
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
++ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ }
+
+ /*
+@@ -1835,7 +1835,7 @@ static int ext4_journalled_write_end(str
+ new_i_size = pos + copied;
+ if (new_i_size > inode->i_size)
+ i_size_write(inode, pos+copied);
+- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
++ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, new_i_size);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+@@ -2673,7 +2673,7 @@ static int __ext4_journalled_writepage(s
+ ret = err;
+
+ walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
++ ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ out:
+ return ret;
+ }
+@@ -3344,7 +3344,8 @@ static sector_t ext4_bmap(struct address
+ filemap_write_and_wait(mapping);
+ }
+
+- if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
++ if (EXT4_JOURNAL(inode) &&
++ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
+ /*
+ * This is a REALLY heavyweight approach, but the use of
+ * bmap on dirty files is expected to be extremely rare:
+@@ -3363,7 +3364,7 @@ static sector_t ext4_bmap(struct address
+ * everything they get.
+ */
+
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
++ ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
+ journal = EXT4_JOURNAL(inode);
+ jbd2_journal_lock_updates(journal);
+ err = jbd2_journal_flush(journal);
+@@ -3831,8 +3832,8 @@ static ssize_t ext4_ext_direct_IO(int rw
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+- } else if (ret > 0 && (EXT4_I(inode)->i_state &
+- EXT4_STATE_DIO_UNWRITTEN)) {
++ } else if (ret > 0 && ext4_test_inode_state(inode,
++ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+@@ -3842,7 +3843,7 @@ static ssize_t ext4_ext_direct_IO(int rw
+ offset, ret);
+ if (err < 0)
+ ret = err;
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
++ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ }
+ return ret;
+ }
+@@ -4490,7 +4491,7 @@ void ext4_truncate(struct inode *inode)
+ return;
+
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
++ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+ ext4_ext_truncate(inode);
+@@ -4776,7 +4777,7 @@ int ext4_get_inode_loc(struct inode *ino
+ {
+ /* We have all inode data except xattrs in memory here. */
+ return __ext4_get_inode_loc(inode, iloc,
+- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
++ !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+ }
+
+ void ext4_set_inode_flags(struct inode *inode)
+@@ -4870,7 +4871,7 @@ struct inode *ext4_iget(struct super_blo
+ }
+ inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+
+- ei->i_state = 0;
++ ei->i_state_flags = 0;
+ ei->i_dir_start_lookup = 0;
+ ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ /* We now have enough fields to check if the inode was active or not.
+@@ -4953,7 +4954,7 @@ struct inode *ext4_iget(struct super_blo
+ EXT4_GOOD_OLD_INODE_SIZE +
+ ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+- ei->i_state |= EXT4_STATE_XATTR;
++ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ } else
+ ei->i_extra_isize = 0;
+@@ -5093,7 +5094,7 @@ static int ext4_do_update_inode(handle_t
+
+ /* For fields not not tracking in the in-memory inode,
+ * initialise them to zero for new inodes. */
+- if (ei->i_state & EXT4_STATE_NEW)
++ if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+ memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+
+ ext4_get_inode_flags(ei);
+@@ -5189,7 +5190,7 @@ static int ext4_do_update_inode(handle_t
+ rc = ext4_handle_dirty_metadata(handle, inode, bh);
+ if (!err)
+ err = rc;
+- ei->i_state &= ~EXT4_STATE_NEW;
++ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+
+ ext4_update_inode_fsync_trans(handle, inode, 0);
+ out_brelse:
+@@ -5613,8 +5614,8 @@ static int ext4_expand_extra_isize(struc
+ entry = IFIRST(header);
+
+ /* No extended attributes present */
+- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
++ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+ new_extra_isize);
+ EXT4_I(inode)->i_extra_isize = new_extra_isize;
+@@ -5658,7 +5659,7 @@ int ext4_mark_inode_dirty(handle_t *hand
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (ext4_handle_valid(handle) &&
+ EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
++ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
+ /*
+ * We need extra buffer credits since we may write into EA block
+ * with this same handle. If journal_extend fails, then it will
+@@ -5672,7 +5673,8 @@ int ext4_mark_inode_dirty(handle_t *hand
+ sbi->s_want_extra_isize,
+ iloc, handle);
+ if (ret) {
+- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
++ ext4_set_inode_state(inode,
++ EXT4_STATE_NO_EXPAND);
+ if (mnt_count !=
+ le16_to_cpu(sbi->s_es->s_mnt_count)) {
+ ext4_warning(inode->i_sb, __func__,
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -357,12 +357,12 @@ static int ext4_ext_swap_inode_data(hand
+ * happened after we started the migrate. We need to
+ * fail the migrate
+ */
+- if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
++ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+ retval = -EAGAIN;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto err_out;
+ } else
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
++ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ /*
+ * We have the extent map build with the tmp inode.
+ * Now copy the i_data across
+@@ -524,7 +524,7 @@ int ext4_ext_migrate(struct inode *inode
+ * allocation.
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+- EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
++ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ handle = ext4_journal_start(inode, 1);
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode
+ void *end;
+ int error;
+
+- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+ return -ENODATA;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+@@ -393,7 +393,7 @@ ext4_xattr_ibody_list(struct inode *inod
+ void *end;
+ int error;
+
+- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+ return 0;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+@@ -903,7 +903,7 @@ ext4_xattr_ibody_find(struct inode *inod
+ is->s.base = is->s.first = IFIRST(header);
+ is->s.here = is->s.first;
+ is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
++ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+ error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+ if (error)
+ return error;
+@@ -935,10 +935,10 @@ ext4_xattr_ibody_set(handle_t *handle, s
+ header = IHDR(inode, ext4_raw_inode(&is->iloc));
+ if (!IS_LAST_ENTRY(s->first)) {
+ header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+- EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
++ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ } else {
+ header->h_magic = cpu_to_le32(0);
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
++ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ return 0;
+ }
+@@ -981,8 +981,8 @@ ext4_xattr_set_handle(handle_t *handle,
+ if (strlen(name) > 255)
+ return -ERANGE;
+ down_write(&EXT4_I(inode)->xattr_sem);
+- no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
+- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
++ no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
++ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+@@ -992,10 +992,10 @@ ext4_xattr_set_handle(handle_t *handle,
+ if (error)
+ goto cleanup;
+
+- if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
++ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
+ struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+ memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
++ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+ }
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+@@ -1047,7 +1047,7 @@ ext4_xattr_set_handle(handle_t *handle,
+ ext4_xattr_update_super_block(handle, inode->i_sb);
+ inode->i_ctime = ext4_current_time(inode);
+ if (!value)
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
++ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+ /*
+ * The bh is consumed by ext4_mark_iloc_dirty, even with
+@@ -1062,7 +1062,7 @@ cleanup:
+ brelse(is.iloc.bh);
+ brelse(bs.bh);
+ if (no_expand == 0)
+- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
++ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ return error;
+ }
--- /dev/null
+From 04cbf99a9333c66de2474429c01e13d110aa5fd0 Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:28 -0400
+Subject: ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode
+
+commit 73b50c1c92666d326b5fa2c945d46509f2f6d91f upstream (as of v2.6.33-git11)
+
+Calls to ext4_handle_dirty_metadata should only pass in an inode
+pointer for inode-specific metadata, and not for shared metadata
+blocks such as inode table blocks, block group descriptors, the
+superblock, etc.
+
+The BUG_ON can get tripped when updating a special device (such as a
+block device) that is opened (so that i_mapping is set in
+fs/block_dev.c) and the file system is mounted in no journal mode.
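+
+A short sketch of the calling convention this patch enforces (condensed
+from the hunks below; 'bh' is the buffer head being dirtied):
+
+    /* inode-private metadata, e.g. an extent or xattr block: */
+    err = ext4_handle_dirty_metadata(handle, inode, bh);
+
+    /* shared metadata (superblock, group descriptors, inode tables,
+     * inode bitmaps) must not be associated with a single inode: */
+    err = ext4_handle_dirty_metadata(handle, NULL, bh);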
+
+Addresses-Google-Bug: #2404870
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4_jbd2.c | 2 +-
+ fs/ext4/ialloc.c | 2 +-
+ fs/ext4/inode.c | 6 +++---
+ fs/ext4/namei.c | 4 ++--
+ 4 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.c
++++ b/fs/ext4/ext4_jbd2.c
+@@ -89,7 +89,7 @@ int __ext4_handle_dirty_metadata(const c
+ ext4_journal_abort_handle(where, __func__, bh,
+ handle, err);
+ } else {
+- if (inode && bh)
++ if (inode)
+ mark_buffer_dirty_inode(bh, inode);
+ else
+ mark_buffer_dirty(bh);
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -904,7 +904,7 @@ repeat_in_this_group:
+ BUFFER_TRACE(inode_bitmap_bh,
+ "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle,
+- inode,
++ NULL,
+ inode_bitmap_bh);
+ if (err)
+ goto fail;
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5158,7 +5158,7 @@ static int ext4_do_update_inode(handle_t
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ sb->s_dirt = 1;
+ ext4_handle_sync(handle);
+- err = ext4_handle_dirty_metadata(handle, inode,
++ err = ext4_handle_dirty_metadata(handle, NULL,
+ EXT4_SB(sb)->s_sbh);
+ }
+ }
+@@ -5187,7 +5187,7 @@ static int ext4_do_update_inode(handle_t
+ }
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+- rc = ext4_handle_dirty_metadata(handle, inode, bh);
++ rc = ext4_handle_dirty_metadata(handle, NULL, bh);
+ if (!err)
+ err = rc;
+ ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+@@ -5741,7 +5741,7 @@ static int ext4_pin_inode(handle_t *hand
+ err = jbd2_journal_get_write_access(handle, iloc.bh);
+ if (!err)
+ err = ext4_handle_dirty_metadata(handle,
+- inode,
++ NULL,
+ iloc.bh);
+ brelse(iloc.bh);
+ }
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2024,7 +2024,7 @@ int ext4_orphan_add(handle_t *handle, st
+ /* Insert this inode at the head of the on-disk orphan list... */
+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+ EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+- err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
++ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+@@ -2096,7 +2096,7 @@ int ext4_orphan_del(handle_t *handle, st
+ if (err)
+ goto out_brelse;
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+- err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
++ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+ } else {
+ struct ext4_iloc iloc2;
+ struct inode *i_prev =
--- /dev/null
+From 9d176d321904553ab92a5df99e25ccb268a5560e Mon Sep 17 00:00:00 2001
+From: Jiaying Zhang <jiayingz@google.com>
+Date: Sun, 30 May 2010 22:49:29 -0400
+Subject: ext4: Add flag to files with blocks intentionally past EOF
+
+commit c8d46e41bc744c8fa0092112af3942fcd46c8b18 upstream (as of v2.6.33-git11)
+
+fallocate() may potentially instantiate blocks past EOF, depending
+on the flags used when it is called.
+
+e2fsck currently has a test for blocks past i_size, and it
+sometimes trips up - noticeably on xfstests 013 which runs fsstress.
+
+This patch from Jiaying does fix it up - it (along with
+e2fsprogs updates and other patches recently from Aneesh) has
+survived many fsstress runs in a row.
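+
+For reference, the blocks past EOF come from a keep-size preallocation,
+e.g. (a userspace sketch; the offset and length are arbitrary):
+
+    #define _GNU_SOURCE
+    #include <fcntl.h>
+
+    /* preallocate 16MB of blocks without changing i_size */
+    fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 << 20);
+
+The inode then legitimately owns blocks beyond i_size, and the new
+EXT4_EOFBLOCKS_FL flag records that this is intentional.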
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: Jiaying Zhang <jiayingz@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 6 ++++--
+ fs/ext4/extents.c | 22 +++++++++++++++++++++-
+ fs/ext4/inode.c | 9 ++++++++-
+ fs/ext4/ioctl.c | 9 +++++++++
+ 4 files changed, 42 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -284,10 +284,12 @@ struct flex_groups {
+ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
+ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
+ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
++#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
++#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
+
+-#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
+-#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
++#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
++#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */
+
+ /* Flags that should be inherited by new inodes from their parent. */
+ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3191,7 +3191,7 @@ int ext4_ext_get_blocks(handle_t *handle
+ {
+ struct ext4_ext_path *path = NULL;
+ struct ext4_extent_header *eh;
+- struct ext4_extent newex, *ex;
++ struct ext4_extent newex, *ex, *last_ex;
+ ext4_fsblk_t newblock;
+ int err = 0, depth, ret, cache_type;
+ unsigned int allocated = 0;
+@@ -3372,6 +3372,19 @@ int ext4_ext_get_blocks(handle_t *handle
+ EXT4_STATE_DIO_UNWRITTEN);
+ }
+ }
++
++ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
++ if (eh->eh_entries) {
++ last_ex = EXT_LAST_EXTENT(eh);
++ if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
++ + ext4_ext_get_actual_len(last_ex))
++ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++ } else {
++ WARN_ON(eh->eh_entries == 0);
++ ext4_error(inode->i_sb, __func__,
++ "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
++ }
++ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err) {
+ /* free data blocks we just allocated */
+@@ -3505,6 +3518,13 @@ static void ext4_falloc_update_inode(str
+ i_size_write(inode, new_size);
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
++ } else {
++ /*
++ * Mark that we allocate beyond EOF so the subsequent truncate
++ * can proceed even if the new size is the same as i_size.
++ */
++ if (new_size > i_size_read(inode))
++ EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+ }
+
+ }
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4490,6 +4490,8 @@ void ext4_truncate(struct inode *inode)
+ if (!ext4_can_truncate(inode))
+ return;
+
++ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+@@ -5345,7 +5347,9 @@ int ext4_setattr(struct dentry *dentry,
+ }
+
+ if (S_ISREG(inode->i_mode) &&
+- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
++ attr->ia_valid & ATTR_SIZE &&
++ (attr->ia_size < inode->i_size ||
++ (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, 3);
+@@ -5376,6 +5380,9 @@ int ext4_setattr(struct dentry *dentry,
+ goto err_out;
+ }
+ }
++ /* ext4_truncate will clear the flag */
++ if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
++ ext4_truncate(inode);
+ }
+
+ rc = inode_setattr(inode, attr);
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsig
+ flags &= ~EXT4_EXTENTS_FL;
+ }
+
++ if (flags & EXT4_EOFBLOCKS_FL) {
++ /* we don't support adding EOFBLOCKS flag */
++ if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
++ err = -EOPNOTSUPP;
++ goto flags_out;
++ }
++ } else if (oldflags & EXT4_EOFBLOCKS_FL)
++ ext4_truncate(inode);
++
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
--- /dev/null
+From 2cbbb92297f15740e27f2e87eb21ab86d4432cba Mon Sep 17 00:00:00 2001
+From: Tao Ma <tao.ma@oracle.com>
+Date: Sun, 30 May 2010 22:49:30 -0400
+Subject: ext4: Fix fencepost error in choosing group vs file preallocation.
+
+commit cc483f102c3f703e853c96f95a654f0106fb2603 upstream (as of v2.6.33-git11)
+
+The ext4 multiblock allocator decides whether to use group or file
+preallocation based on the file size. When the file size reaches
+s_mb_stream_request (default is 16 blocks), it changes to use a
+file-specific preallocation. This is cool, but it has a tiny problem.
+
+See a simple script:
+mkfs.ext4 -b 1024 /dev/sda8 1000000
+mount -t ext4 -o nodelalloc /dev/sda8 /mnt/ext4
+for((i=0;i<5;i++))
+do
+cat /mnt/4096>>/mnt/ext4/a #4096 is a file with 4096 characters.
+cat /mnt/4096>>/mnt/ext4/b
+done
+debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1
+
+And you get
+BLOCKS:
+(0-14):8705-8719, (15):2356, (16-19):8465-8468
+
+So there are 3 extents, which is a bit strange for the lonely 15th
+logical block. As we write the 16th block, we choose file preallocation
+in ext4_mb_group_or_file, but in ext4_mb_normalize_request we hit the
+16*1024 range, so no preallocation is carried out. File b then
+reserves the space after '2356', so when we write block 16, we start
+from another part.
+
+This patch just changes the check in ext4_mb_group_or_file, so
+that for the lonely block 15 we will still use group preallocation.
+After the patch, we will get:
+debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1
+BLOCKS:
+(0-15):8705-8720, (16-19):8465-8468
+
+Looks more sane. Thanks.
+
+Signed-off-by: Tao Ma <tao.ma@oracle.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3938,7 +3938,7 @@ static void ext4_mb_group_or_file(struct
+
+ /* don't use group allocation for large files */
+ size = max(size, isize);
+- if (size >= sbi->s_mb_stream_request) {
++ if (size > sbi->s_mb_stream_request) {
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ return;
+ }
--- /dev/null
+From 492c93e8097f0bf58b2884064af85242fabe5d71 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:31 -0400
+Subject: ext4: fix error handling in migrate
+
+commit f39490bcd1691d65dc33689222a12e1fc13dd824 upstream (as of v2.6.33-git11)
+
+Set i_nlink to zero for the temporary inode from the very beginning;
+otherwise we may fail to start a new journal handle and this
+inode will be left unreferenced but with i_nlink == 1.
+Since we hold an inode reference it can not be pruned.
+
+Also add the missed journal_start() return value check.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/migrate.c | 29 ++++++++++++++---------------
+ 1 file changed, 14 insertions(+), 15 deletions(-)
+
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -494,14 +494,10 @@ int ext4_ext_migrate(struct inode *inode
+ }
+ i_size_write(tmp_inode, i_size_read(inode));
+ /*
+- * We don't want the inode to be reclaimed
+- * if we got interrupted in between. We have
+- * this tmp inode carrying reference to the
+- * data blocks of the original file. We set
+- * the i_nlink to zero at the last stage after
+- * switching the original file to extent format
++ * Set the i_nlink to zero so it will be deleted later
++ * when we drop inode reference.
+ */
+- tmp_inode->i_nlink = 1;
++ tmp_inode->i_nlink = 0;
+
+ ext4_ext_tree_init(handle, tmp_inode);
+ ext4_orphan_add(handle, tmp_inode);
+@@ -528,6 +524,16 @@ int ext4_ext_migrate(struct inode *inode
+ up_read((&EXT4_I(inode)->i_data_sem));
+
+ handle = ext4_journal_start(inode, 1);
++ if (IS_ERR(handle)) {
++ /*
++ * It is impossible to update on-disk structures without
++ * a handle, so just roll back in-core changes and leave other
++ * work to orphan_list_cleanup()
++ */
++ ext4_orphan_del(NULL, tmp_inode);
++ retval = PTR_ERR(handle);
++ goto out;
++ }
+
+ ei = EXT4_I(inode);
+ i_data = ei->i_data;
+@@ -609,15 +615,8 @@ err_out:
+
+ /* Reset the extent details */
+ ext4_ext_tree_init(handle, tmp_inode);
+-
+- /*
+- * Set the i_nlink to zero so that
+- * generic_drop_inode really deletes the
+- * inode
+- */
+- tmp_inode->i_nlink = 0;
+-
+ ext4_journal_stop(handle);
++out:
+ unlock_new_inode(tmp_inode);
+ iput(tmp_inode);
+
--- /dev/null
+From 6c582d8b4e6868f8e16d160c0435530d5f8fa8e5 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:32 -0400
+Subject: ext4: explicitly remove inode from orphan list after failed direct io
+
+commit da1dafca84413145f5ac59998b4cdd06fb89f721 upstream (as of v2.6.33-git11)
+
+Otherwise we will be left with a non-empty orphan list at umount.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3480,6 +3480,9 @@ retry:
+ * but cannot extend i_size. Bail out and pretend
+ * the write failed... */
+ ret = PTR_ERR(handle);
++ if (inode->i_nlink)
++ ext4_orphan_del(NULL, inode);
++
+ goto out;
+ }
+ if (inode->i_nlink)
--- /dev/null
+From 7765050b0f7e5ffc9146c5cea83a14774ff03a73 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:33 -0400
+Subject: ext4: Handle non empty on-disk orphan link
+
+commit 6e3617e579e070d3655a93ee9ed7149113e795e0 upstream (as of v2.6.33-git11)
+
+In case of truncate errors we explicitly remove the inode from the
+in-core orphan list via orphan_del(NULL, inode) without modifying the
+on-disk list.
+
+But later on, the same inode may be inserted in the orphan list again,
+which will result in the on-disk linked list getting corrupted. If the
+inode's i_dtime contains a valid value, then skip the on-disk list
+modification.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/namei.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2020,6 +2020,13 @@ int ext4_orphan_add(handle_t *handle, st
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ if (err)
+ goto out_unlock;
++ /*
++ * Due to previous errors inode may be already a part of on-disk
++ * orphan list. If so skip on-disk list modification.
++ */
++ if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
++ (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
++ goto mem_insert;
+
+ /* Insert this inode at the head of the on-disk orphan list... */
+ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+@@ -2037,6 +2044,7 @@ int ext4_orphan_add(handle_t *handle, st
+ *
+ * This is safe: on error we're going to ignore the orphan list
+ * anyway on the next recovery. */
++mem_insert:
+ if (!err)
+ list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+
--- /dev/null
+From 5921c8d6a6e598b1101b5785f09bbe334e92957d Mon Sep 17 00:00:00 2001
+From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Date: Sun, 30 May 2010 22:49:34 -0400
+Subject: ext4: make "offset" consistent in ext4_check_dir_entry()
+
+commit b8b8afe236e97b6359d46d3a3f8c46455e192271 upstream (as of v2.6.33-git11)
+
+The callers of ext4_check_dir_entry() usually pass in the "file
+offset" (ext4_readdir, htree_dirblock_to_tree, search_dirblock,
+ext4_dx_find_entry, empty_dir), but a few callers (add_dirent_to_buf,
+ext4_delete_entry) only pass in the buffer offset.
+
+To accommodate those last two (which would be hard to fix otherwise),
+this patch changes ext4_check_dir_entry() to print the physical block
+number and the relative offset as well as the passed-in offset.
+
+Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/dir.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *fun
+
+ if (error_msg != NULL)
+ ext4_error(dir->i_sb, function,
+- "bad entry in directory #%lu: %s - "
+- "offset=%u, inode=%u, rec_len=%d, name_len=%d",
+- dir->i_ino, error_msg, offset,
++ "bad entry in directory #%lu: %s - block=%llu"
++ "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
++ dir->i_ino, error_msg,
++ (unsigned long long) bh->b_blocknr,
++ (unsigned) (offset%bh->b_size), offset,
+ le32_to_cpu(de->inode),
+ rlen, de->name_len);
+ return error_msg == NULL ? 1 : 0;
--- /dev/null
+From 98cc8ca4405bfb2d511c83ced6c46153c04d5f76 Mon Sep 17 00:00:00 2001
+From: Akira Fujita <a-fujita@rs.jp.nec.com>
+Date: Sun, 30 May 2010 22:49:35 -0400
+Subject: ext4: Fix insertion point of extent in mext_insert_across_blocks()
+
+commit 5fd5249aa36fad98c9fd5edced352939e54f9324 upstream (as of v2.6.33-git11)
+
+If the leaf node has 2 extent spaces or fewer and the EXT4_IOC_MOVE_EXT
+ioctl is called with a file offset beyond what the 2nd extent
+covers, mext_insert_across_blocks() always tries to insert the extent
+into the first extent. As a result, the file gets corrupted because of
+the wrong extent order. The patch fixes this problem.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *hand
+ }
+
+ o_start->ee_len = start_ext->ee_len;
++ eblock = le32_to_cpu(start_ext->ee_block);
+ new_flag = 1;
+
+ } else if (start_ext->ee_len && new_ext->ee_len &&
+@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *hand
+ * orig |------------------------------|
+ */
+ o_start->ee_len = start_ext->ee_len;
++ eblock = le32_to_cpu(start_ext->ee_block);
+ new_flag = 1;
+
+ } else if (!start_ext->ee_len && new_ext->ee_len &&
+@@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct
+ le32_to_cpu(oext->ee_block) + oext_alen) {
+ start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+ le32_to_cpu(oext->ee_block));
++ start_ext.ee_block = oext->ee_block;
+ copy_extent_status(oext, &start_ext);
+ } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+ prev_ext = oext - 1;
+@@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct
+ start_ext.ee_len = cpu_to_le16(
+ ext4_ext_get_actual_len(prev_ext) +
+ new_ext_alen);
++ start_ext.ee_block = oext->ee_block;
+ copy_extent_status(prev_ext, &start_ext);
+ new_ext.ee_len = 0;
+ }
--- /dev/null
+From 06518e8c9d0a67cb024545b880849b68b79a5390 Mon Sep 17 00:00:00 2001
+From: Akira Fujita <a-fujita@rs.jp.nec.com>
+Date: Sun, 30 May 2010 22:49:36 -0400
+Subject: ext4: Fix the NULL reference in double_down_write_data_sem()
+
+commit 7247c0caa23d94a1cb6b307edba9dc45fb0798d4 upstream (as of v2.6.33-git11)
+
+If the EXT4_IOC_MOVE_EXT ioctl is called with a NULL donor_fd, fget() in
+ext4_ioctl() gets an inappropriate file structure for the donor; so we
+need to do this check earlier, before calling
+double_down_write_data_sem().
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -953,14 +953,6 @@ mext_check_arguments(struct inode *orig_
+ unsigned int blkbits = orig_inode->i_blkbits;
+ unsigned int blocksize = 1 << blkbits;
+
+- /* Regular file check */
+- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+- ext4_debug("ext4 move extent: The argument files should be "
+- "regular file [ino:orig %lu, donor %lu]\n",
+- orig_inode->i_ino, donor_inode->i_ino);
+- return -EINVAL;
+- }
+-
+ if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+ ext4_debug("ext4 move extent: suid or sgid is set"
+ " to donor file [ino:orig %lu, donor %lu]\n",
+@@ -1207,6 +1199,14 @@ ext4_move_extents(struct file *o_filp, s
+ orig_inode->i_ino, donor_inode->i_ino);
+ return -EINVAL;
+ }
++
++ /* Regular file check */
++ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
++ ext4_debug("ext4 move extent: The argument files should be "
++ "regular file [ino:orig %lu, donor %lu]\n",
++ orig_inode->i_ino, donor_inode->i_ino);
++ return -EINVAL;
++ }
+
+ /* Protect orig and donor inodes against a truncate */
+ ret1 = mext_inode_double_lock(orig_inode, donor_inode);
--- /dev/null
+From eee98b87da36ae78c6867d8ce1943f65a16da648 Mon Sep 17 00:00:00 2001
+From: Akira Fujita <a-fujita@rs.jp.nec.com>
+Date: Sun, 30 May 2010 22:49:37 -0400
+Subject: ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl
+
+commit c437b2733520599a2c6e0dbcdeae611319f84707 upstream (as of v2.6.33-git11)
+
+a) Fix sparse warning in ext4_ioctl()
+b) Remove unneeded variable in mext_leaf_block()
+c) Fix spelling typo in mext_check_arguments()
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c | 3 ++-
+ fs/ext4/move_extent.c | 4 +---
+ 2 files changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -258,7 +258,8 @@ setversion_out:
+ if (me.moved_len > 0)
+ file_remove_suid(donor_filp);
+
+- if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
++ if (copy_to_user((struct move_extent __user *)arg,
++ &me, sizeof(me)))
+ err = -EFAULT;
+ mext_out:
+ fput(donor_filp);
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -477,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct
+ struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+ struct ext4_extent new_ext, start_ext, end_ext;
+ ext4_lblk_t new_ext_end;
+- ext4_fsblk_t new_phys_end;
+ int oext_alen, new_ext_alen, end_ext_alen;
+ int depth = ext_depth(orig_inode);
+ int ret;
+@@ -491,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct
+ new_ext.ee_len = dext->ee_len;
+ new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+ new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+- new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
+
+ /*
+ * Case: original extent is first
+@@ -932,7 +930,7 @@ out2:
+ }
+
+ /**
+- * mext_check_argumants - Check whether move extent can be done
++ * mext_check_arguments - Check whether move extent can be done
+ *
+ * @orig_inode: original inode
+ * @donor_inode: donor inode
--- /dev/null
+From 24bce2c3022a0ff4cb418ed11173bef96bd9806a Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Sun, 30 May 2010 22:49:38 -0400
+Subject: ext4: Fix estimate of # of blocks needed to write indirect-mapped files
+
+commit d330a5befb88875a9b3d2db62f9b74dadf660b13 upstream (as of v2.6.34-rc3)
+
+http://bugzilla.kernel.org/show_bug.cgi?id=15420
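+
+The commit message is terse, so here is a worked example of the broken
+estimate (assuming 4K blocks, i.e. EXT4_ADDR_PER_BLOCK = 1024 and
+EXT4_ADDR_PER_BLOCK_BITS = 10, and an arbitrary lblock = 1,000,000):
+
+    /* old code: roundup_pow_of_two() returns a value, not a bit
+     * count, so it yields 1,048,576 here and the estimate becomes
+     * 1,048,576 / 10 + 1, i.e. about 105,000 metadata blocks */
+    blk_bits = roundup_pow_of_two(lblock + 1);
+
+    /* new code: order_base_2() returns the exponent (20 here), so
+     * the estimate is 20 / 10 + 1 = 3, matching the maximum depth
+     * of the indirect block tree */
+    blk_bits = order_base_2(lblock);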
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1060,7 +1060,7 @@ static int ext4_indirect_calc_metadata_a
+ sector_t lblock)
+ {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+- int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
++ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+ int blk_bits;
+
+ if (lblock < EXT4_NDIR_BLOCKS)
+@@ -1075,7 +1075,7 @@ static int ext4_indirect_calc_metadata_a
+ }
+ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+ ei->i_da_metadata_calc_len = 1;
+- blk_bits = roundup_pow_of_two(lblock + 1);
++ blk_bits = order_base_2(lblock);
+ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+ }
+
--- /dev/null
+From 0177767f12e4ebcb387fc3c7e5945611ce0dd6f1 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:39 -0400
+Subject: ext4: Fixed inode allocator to correctly track a flex_bg's used_dirs
+
+commit c4caae25187ff3f5e837c6f04eb1acc2723c72d3 upstream (as of v2.6.34-rc3)
+
+When used_dirs was introduced for the flex_groups struct, it looks
+like the accounting was not put into place properly, in some places
+manipulating free_inodes rather than used_dirs.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ialloc.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -268,7 +268,7 @@ void ext4_free_inode(handle_t *handle, s
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+- atomic_dec(&sbi->s_flex_groups[f].free_inodes);
++ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+
+ }
+@@ -779,7 +779,7 @@ static int ext4_claim_inode(struct super
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
++ atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ }
+ }
+ gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
--- /dev/null
+From 457ad9487d209f3c7bcb6de32aa393f75ba5e22d Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:40 -0400
+Subject: ext4: Fix possible lost inode write in no journal mode
+
+commit 8b472d739b2ddd8ab7fb278874f696cd95b25a5e upstream (as of v2.6.34-rc6)
+
+In the no-journal case, ext4_write_inode() will fetch the bh and call
+sync_dirty_buffer() on it. However, if the bh has already been
+written and the bh reclaimed for some other purpose, AND if the inode
+is the only one in the inode table block in use, then
+ext4_get_inode_loc() will not read the inode table block from disk,
+but as an optimization, fill the block with zeros assuming that its
+caller will copy in the on-disk version of the inode. This is not
+done by ext4_write_inode(), so the contents of the inode can simply
+get lost. The fix is to use __ext4_get_inode_loc() with in_mem set to
+0, instead of ext4_get_inode_loc(). Long term the API needs to be
+fixed so it's obvious why the latter is not safe.
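+
+The change itself is a one-liner; in outline (the third argument is the
+in_mem flag of __ext4_get_inode_loc()):
+
+    /* in_mem == 0 forces the inode table block to be read from disk,
+     * so the other inodes sharing the block are preserved when the
+     * buffer is written back */
+    err = __ext4_get_inode_loc(inode, &iloc, 0);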
+
+Addresses-Google-Bug: #2526446
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5260,7 +5260,7 @@ int ext4_write_inode(struct inode *inode
+ } else {
+ struct ext4_iloc iloc;
+
+- err = ext4_get_inode_loc(inode, &iloc);
++ err = __ext4_get_inode_loc(inode, &iloc, 0);
+ if (err)
+ return err;
+ if (wait)
--- /dev/null
+From 62de51f3a99493a99d7f4e3793b5952b40880ea0 Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:41 -0400
+Subject: ext4: Fix buffer head leaks after calls to ext4_get_inode_loc()
+
+commit fd2dd9fbaf9e498ec63eef298921e36556f7214c upstream (as of v2.6.34-rc6)
+
+Calls to ext4_get_inode_loc() return with a reference to a buffer
+head in iloc->bh. The callers of this function in ext4_write_inode()
+when in no journal mode and in ext4_xattr_fiemap() don't release the
+buffer head after using it.
+
+Addresses-Google-Bug: #2548165
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 1 +
+ fs/ext4/inode.c | 1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3778,6 +3778,7 @@ static int ext4_xattr_fiemap(struct inod
+ physical += offset;
+ length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
++ brelse(iloc.bh);
+ } else { /* external block */
+ physical = EXT4_I(inode)->i_file_acl << blockbits;
+ length = inode->i_sb->s_blocksize;
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5273,6 +5273,7 @@ int ext4_write_inode(struct inode *inode
+ (unsigned long long)iloc.bh->b_blocknr);
+ err = -EIO;
+ }
++ brelse(iloc.bh);
+ }
+ return err;
+ }
--- /dev/null
+From 462d9c2b296ce81bf4c6a6899e256ae6188f9a5a Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:42 -0400
+Subject: ext4: Issue the discard operation *before* releasing the blocks to be reused
+
+commit b90f687018e6d6c77d981b09203780f7001407e5 upstream (as of v2.6.34-rc6)
+
+Otherwise, we can end up having data corruption because the blocks
+could get reused and then discarded!
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15579
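+
+A sketch of the ordering enforced by this patch (mb_free_blocks() is
+the call, later in release_blocks_on_commit() and not shown in the
+hunks below, that returns the range to the buddy cache):
+
+    /* 1. discard the freed range while it still cannot be reused ... */
+    sb_issue_discard(sb, discard_block, entry->count);
+
+    /* 2. ... and only then release it to the buddy cache, where a
+     * concurrent allocation may pick the blocks up immediately */
+    mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);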
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 24 +++++++++++-------------
+ 1 file changed, 11 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2537,6 +2537,17 @@ static void release_blocks_on_commit(jou
+ mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+ entry->count, entry->group, entry);
+
++ if (test_opt(sb, DISCARD)) {
++ ext4_fsblk_t discard_block;
++
++ discard_block = entry->start_blk +
++ ext4_group_first_block_no(sb, entry->group);
++ trace_ext4_discard_blocks(sb,
++ (unsigned long long)discard_block,
++ entry->count);
++ sb_issue_discard(sb, discard_block, entry->count);
++ }
++
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
+ /* we expect to find existing buddy because it's pinned */
+ BUG_ON(err != 0);
+@@ -2558,19 +2569,6 @@ static void release_blocks_on_commit(jou
+ page_cache_release(e4b.bd_bitmap_page);
+ }
+ ext4_unlock_group(sb, entry->group);
+- if (test_opt(sb, DISCARD)) {
+- ext4_fsblk_t discard_block;
+- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+-
+- discard_block = (ext4_fsblk_t)entry->group *
+- EXT4_BLOCKS_PER_GROUP(sb)
+- + entry->start_blk
+- + le32_to_cpu(es->s_first_data_block);
+- trace_ext4_discard_blocks(sb,
+- (unsigned long long)discard_block,
+- entry->count);
+- sb_issue_discard(sb, discard_block, entry->count);
+- }
+ kmem_cache_free(ext4_free_ext_cachep, entry);
+ ext4_mb_release_desc(&e4b);
+ }
--- /dev/null
+From 6aac59ef585709fa8e03cf86dc741954b3af47c7 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:43 -0400
+Subject: ext4: check missed return value in ext4_sync_file()
+
+commit 0671e704658b9f26f85e78d51176daa861f955c7 upstream (as of v2.6.34-git13)
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/fsync.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -101,7 +101,7 @@ int ext4_sync_file(struct file *file, st
+ (journal->j_fs_dev != journal->j_dev) &&
+ (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+- jbd2_log_wait_commit(journal, commit_tid);
++ ret = jbd2_log_wait_commit(journal, commit_tid);
+ } else if (journal->j_flags & JBD2_BARRIER)
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ return ret;
--- /dev/null
+From bc65559adfab46dcbcab65d1830490c5043983bf Mon Sep 17 00:00:00 2001
+From: Jing Zhang <zj.barak@gmail.com>
+Date: Sun, 30 May 2010 22:49:44 -0400
+Subject: ext4: fix memory leaks in error path handling of ext4_ext_zeroout()
+
+commit b720303df7352d4a7a1f61e467e0a124913c0d41 upstream (as of v2.6.34-git13)
+
+When EIO occurs after bio is submitted, there is no memory free
+operation for bio, which results in memory leakage. And there is also
+no check against bio_alloc() for bio.
+
+Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
+Signed-off-by: Jing Zhang <zj.barak@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2446,7 +2446,7 @@ static void bi_complete(struct bio *bio,
+ /* FIXME!! we need to try to merge to left or right after zero-out */
+ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+ {
+- int ret = -EIO;
++ int ret;
+ struct bio *bio;
+ int blkbits, blocksize;
+ sector_t ee_pblock;
+@@ -2470,6 +2470,9 @@ static int ext4_ext_zeroout(struct inode
+ len = ee_len;
+
+ bio = bio_alloc(GFP_NOIO, len);
++ if (!bio)
++ return -ENOMEM;
++
+ bio->bi_sector = ee_pblock;
+ bio->bi_bdev = inode->i_sb->s_bdev;
+
+@@ -2497,17 +2500,15 @@ static int ext4_ext_zeroout(struct inode
+ submit_bio(WRITE, bio);
+ wait_for_completion(&event);
+
+- if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+- ret = 0;
+- else {
+- ret = -EIO;
+- break;
++ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
++ bio_put(bio);
++ return -EIO;
+ }
+ bio_put(bio);
+ ee_len -= done;
+ ee_pblock += done << (blkbits - 9);
+ }
+- return ret;
++ return 0;
+ }
+
+ #define EXT4_EXT_ZERO_LEN 7
--- /dev/null
+From dc93068aadac2019c504112d2761773e64e7ba72 Mon Sep 17 00:00:00 2001
+From: Jing Zhang <zj.barak@gmail.com>
+Date: Sun, 30 May 2010 22:49:45 -0400
+Subject: ext4: Remove unnecessary call to ext4_get_group_desc() in mballoc
+
+commit 62e823a2cba18509ee826d775270e8ef9071b5bc upstream (as of v2.6.34-git13)
+
+Signed-off-by: Jing Zhang <zj.barak@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2027,7 +2027,6 @@ repeat:
+
+ for (i = 0; i < ngroups; group++, i++) {
+ struct ext4_group_info *grp;
+- struct ext4_group_desc *desc;
+
+ if (group == ngroups)
+ group = 0;
+@@ -2050,7 +2049,6 @@ repeat:
+ }
+
+ ac->ac_groups_scanned++;
+- desc = ext4_get_group_desc(sb, group, NULL);
+ if (cr == 0)
+ ext4_mb_simple_scan_group(ac, &e4b);
+ else if (cr == 1 &&
--- /dev/null
+From 5fc0d2b4f06dfd2a941e23171a5a4a155383c47a Mon Sep 17 00:00:00 2001
+From: Jing Zhang <zj.barak@gmail.com>
+Date: Sun, 30 May 2010 22:49:46 -0400
+Subject: ext4: rename ext4_mb_release_desc() to ext4_mb_unload_buddy()
+
+commit e39e07fdfd98be8650385f12a7b81d6adc547510 upstream (as of v2.6.34-git13)
+
+This function cleans up after ext4_mb_load_buddy(), so the renaming
+makes the code clearer.
+
+Signed-off-by: Jing Zhang <zj.barak@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1150,7 +1150,7 @@ err:
+ return ret;
+ }
+
+-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
++static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
+ {
+ if (e4b->bd_bitmap_page)
+ page_cache_release(e4b->bd_bitmap_page);
+@@ -1618,7 +1618,7 @@ int ext4_mb_try_best_found(struct ext4_a
+ }
+
+ ext4_unlock_group(ac->ac_sb, group);
+- ext4_mb_release_desc(e4b);
++ ext4_mb_unload_buddy(e4b);
+
+ return 0;
+ }
+@@ -1674,7 +1674,7 @@ int ext4_mb_find_by_goal(struct ext4_all
+ ext4_mb_use_best_found(ac, e4b);
+ }
+ ext4_unlock_group(ac->ac_sb, group);
+- ext4_mb_release_desc(e4b);
++ ext4_mb_unload_buddy(e4b);
+
+ return 0;
+ }
+@@ -2044,7 +2044,7 @@ repeat:
+ if (!ext4_mb_good_group(ac, group, cr)) {
+ /* someone did allocation from this group */
+ ext4_unlock_group(sb, group);
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+ continue;
+ }
+
+@@ -2058,7 +2058,7 @@ repeat:
+ ext4_mb_complex_scan_group(ac, &e4b);
+
+ ext4_unlock_group(sb, group);
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
+@@ -2148,7 +2148,7 @@ static int ext4_mb_seq_groups_show(struc
+ ext4_lock_group(sb, group);
+ memcpy(&sg, ext4_get_group_info(sb, group), i);
+ ext4_unlock_group(sb, group);
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
+ sg.info.bb_fragments, sg.info.bb_first_free);
+@@ -2568,7 +2568,7 @@ static void release_blocks_on_commit(jou
+ }
+ ext4_unlock_group(sb, entry->group);
+ kmem_cache_free(ext4_free_ext_cachep, entry);
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+ }
+
+ mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+@@ -3705,7 +3705,7 @@ out:
+ ext4_unlock_group(sb, group);
+ if (ac)
+ kmem_cache_free(ext4_ac_cachep, ac);
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+ put_bh(bitmap_bh);
+ return free;
+ }
+@@ -3809,7 +3809,7 @@ repeat:
+ if (bitmap_bh == NULL) {
+ ext4_error(sb, __func__, "Error in reading block "
+ "bitmap for %u", group);
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+ continue;
+ }
+
+@@ -3818,7 +3818,7 @@ repeat:
+ ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+ ext4_unlock_group(sb, group);
+
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+ put_bh(bitmap_bh);
+
+ list_del(&pa->u.pa_tmp_list);
+@@ -4082,7 +4082,7 @@ ext4_mb_discard_lg_preallocations(struct
+ ext4_mb_release_group_pa(&e4b, pa, ac);
+ ext4_unlock_group(sb, group);
+
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+ list_del(&pa->u.pa_tmp_list);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+@@ -4584,7 +4584,7 @@ do_more:
+ atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+ }
+
+- ext4_mb_release_desc(&e4b);
++ ext4_mb_unload_buddy(&e4b);
+
+ *freed += count;
+
--- /dev/null
+From 9a0bd6ee7ccc0cfdc614dbc6a4708d596ec53f82 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Sun, 30 May 2010 22:49:47 -0400
+Subject: ext4: allow defrag (EXT4_IOC_MOVE_EXT) in 32bit compat mode
+
+commit b684b2ee9409f2890a8b3aea98525bbe5f84e276 upstream (as of v2.6.34-git13)
+
+I have an x86_64 kernel with i386 userspace. e4defrag fails on the
+EXT4_IOC_MOVE_EXT ioctl because it is not wired up for the compat
+case.  It seems that struct move_extent is compat safe, since only
+types with fixed widths are used:
+{
+ __u32 reserved; /* should be zero */
+ __u32 donor_fd; /* donor file descriptor */
+ __u64 orig_start; /* logical start offset in block for orig */
+ __u64 donor_start; /* logical start offset in block for donor */
+ __u64 len; /* block length to be moved */
+ __u64 moved_len; /* moved block length */
+};
+
+Let's just wire up EXT4_IOC_MOVE_EXT for the compat case.
+
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+CC: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -375,6 +375,8 @@ long ext4_compat_ioctl(struct file *file
+ break;
+ case EXT4_IOC_GROUP_ADD:
+ break;
++ case EXT4_IOC_MOVE_EXT:
++ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
--- /dev/null
+From 93984006ca6af7d067409fd6db2bedd999af2b0d Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:48 -0400
+Subject: ext4: fix quota accounting in case of fallocate
+
+commit 35121c9860316d7799cea0fbc359a9186e7c2747 upstream (as of v2.6.34-git13)
+
+allocated_meta_blocks is already included in the 'used' variable.
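+
+As a small worked example (numbers purely illustrative): suppose 10
+blocks were reserved and the allocation ends up using 8 data blocks plus
+2 metadata blocks, so used = 10 and allocated_meta_blocks = 2.  The 2
+metadata blocks are converted from reservation to usage by
+vfs_dq_claim_block(); releasing mdb_free + used afterwards would drop
+their reservation a second time, so the release must be
+mdb_free + used - allocated_meta_blocks.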
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1149,7 +1149,8 @@ void ext4_da_update_reserve_space(struct
+ */
+ if (allocated_meta_blocks)
+ vfs_dq_claim_block(inode, allocated_meta_blocks);
+- vfs_dq_release_reservation_block(inode, mdb_free + used);
++ vfs_dq_release_reservation_block(inode, mdb_free + used -
++ allocated_meta_blocks);
+ }
+
+ /*
--- /dev/null
+From 9e92f0bbe85a6ceead4b1215861f1a30bfe1d9dc Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:49 -0400
+Subject: ext4: check s_log_groups_per_flex in online resize code
+
+commit 42007efd569f1cf3bfb9a61da60ef6c2179508ca upstream (as of v2.6.34-git13)
+
+If groups_per_flex < 2, sbi->s_flex_groups[] doesn't get filled out,
+and every other access to this array first tests s_log_groups_per_flex;
+the same check needs to happen in resize or we'll wander off into
+a NULL pointer when doing an online resize of the file system.
+
+Thanks to Christoph Biedl, who came up with the trivial testcase:
+
+# truncate --size 128M fsfile
+# mkfs.ext3 -F fsfile
+# tune2fs -O extents,uninit_bg,dir_index,flex_bg,huge_file,dir_nlink,extra_isize fsfile
+# e2fsck -yDf -C0 fsfile
+# truncate --size 132M fsfile
+# losetup /dev/loop0 fsfile
+# mount /dev/loop0 mnt
+# resize2fs -p /dev/loop0
+
+ https://bugzilla.kernel.org/show_bug.cgi?id=13549
+
+Reported-by: Alessandro Polverini <alex@nibbles.it>
+Test-case-by: Christoph Biedl <bugzilla.kernel.bpeb@manchmal.in-ulm.de>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/resize.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -930,7 +930,8 @@ int ext4_group_add(struct super_block *s
+ percpu_counter_add(&sbi->s_freeinodes_counter,
+ EXT4_INODES_PER_GROUP(sb));
+
+- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
++ sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group;
+ flex_group = ext4_flex_group(sbi, input->group);
+ atomic_add(input->free_blocks_count,
--- /dev/null
+From 168b7c0d3438662c33488f73a27036f14c176efc Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:50 -0400
+Subject: ext4: don't return to userspace after freezing the fs with a mutex held
+
+commit 6b0310fbf087ad6e9e3b8392adca97cd77184084 upstream (as of v2.6.34-git13)
+
+ext4_freeze() used jbd2_journal_lock_updates(), which takes
+the j_barrier mutex, and then returned to userspace with it held.
+The kernel does not like this:
+
+================================================
+[ BUG: lock held when returning to user space! ]
+------------------------------------------------
+lvcreate/1075 is leaving the kernel with locks still held!
+1 lock held by lvcreate/1075:
+ #0: (&journal->j_barrier){+.+...}, at: [<ffffffff811c6214>]
+jbd2_journal_lock_updates+0xe1/0xf0
+
+Instead, add vfs_check_frozen() checks to ext4_journal_start_sb() and
+ext4_force_commit().
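+
+For reference (not part of this patch), vfs_check_frozen() in kernels of
+this vintage is roughly the following wait on the superblock freeze
+state, so writers simply block while the filesystem is frozen rather
+than a mutex being held across the return to userspace:
+
+#define vfs_check_frozen(sb, level) \
+	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))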
+
+Addresses-Red-Hat-Bugzilla: #568503
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -227,6 +227,7 @@ handle_t *ext4_journal_start_sb(struct s
+ if (sb->s_flags & MS_RDONLY)
+ return ERR_PTR(-EROFS);
+
++ vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ /* Special case here: if the journal has aborted behind our
+ * backs (eg. EIO in the commit thread), then we still need to
+ * take the FS itself readonly cleanly. */
+@@ -3391,8 +3392,10 @@ int ext4_force_commit(struct super_block
+ return 0;
+
+ journal = EXT4_SB(sb)->s_journal;
+- if (journal)
++ if (journal) {
++ vfs_check_frozen(sb, SB_FREEZE_WRITE);
+ ret = ext4_journal_force_commit(journal);
++ }
+
+ return ret;
+ }
+@@ -3441,18 +3444,16 @@ static int ext4_freeze(struct super_bloc
+ * the journal.
+ */
+ error = jbd2_journal_flush(journal);
+- if (error < 0) {
+- out:
+- jbd2_journal_unlock_updates(journal);
+- return error;
+- }
++ if (error < 0)
++ goto out;
+
+ /* Journal blocked and flushed, clear needs_recovery flag. */
+ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ error = ext4_commit_super(sb, 1);
+- if (error)
+- goto out;
+- return 0;
++out:
++ /* we rely on s_frozen to stop further updates */
++ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
++ return error;
+ }
+
+ /*
+@@ -3469,7 +3470,6 @@ static int ext4_unfreeze(struct super_bl
+ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_commit_super(sb, 1);
+ unlock_super(sb);
+- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ return 0;
+ }
+
--- /dev/null
+From 0778bf26394249a97740013f92198b5272703e8b Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:51 -0400
+Subject: ext4: stop issuing discards if not supported by device
+
+commit a30eec2a8650a77f754e84b2e15f062fe652baa7 upstream (as of v2.6.34-git13)
+
+Turn off issuance of discard requests if the device does
+not support them, similar to the action we take for barriers.
+This will save a little computation time if a non-discardable
+device is mounted with -o discard, and also makes it obvious
+that the filesystem is not doing what was asked for at mount time ...
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2536,6 +2536,7 @@ static void release_blocks_on_commit(jou
+ entry->count, entry->group, entry);
+
+ if (test_opt(sb, DISCARD)) {
++ int ret;
+ ext4_fsblk_t discard_block;
+
+ discard_block = entry->start_blk +
+@@ -2543,7 +2544,12 @@ static void release_blocks_on_commit(jou
+ trace_ext4_discard_blocks(sb,
+ (unsigned long long)discard_block,
+ entry->count);
+- sb_issue_discard(sb, discard_block, entry->count);
++ ret = sb_issue_discard(sb, discard_block, entry->count);
++ if (ret == EOPNOTSUPP) {
++ ext4_warning(sb, __func__,
++ "discard not supported, disabling");
++ clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
++ }
+ }
+
+ err = ext4_mb_load_buddy(sb, entry->group, &e4b);
--- /dev/null
+From 2f4283aff3e5415fa36cbf81aa2a6247bfbb0527 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:52 -0400
+Subject: ext4: don't scan/accumulate more pages than mballoc will allocate
+
+commit c445e3e0a5c2804524dec6e55f66d63f6bc5bc3e upstream (as of v2.6.34-git13)
+
+There was a bug reported on RHEL5 that a 10G dd on a 12G box
+had a very, very slow sync after that.
+
+At issue was the loop in write_cache_pages() scanning all the way
+to the end of the 10G file, even though the subsequent call
+to mpage_da_submit_io() would only actually write a smallish amount;
+we then went back to the write_cache_pages() loop ... wasting tons of
+time calling __mpage_da_writepage() for thousands of pages we would
+just revisit (many times) later.
+
+Upstream it's not such a big issue for sys_sync because we get
+to the loop with a much smaller nr_to_write, which limits the loop.
+
+However, in discussion with Aneesh it became clear that fsync upstream
+still gets here with a very large nr_to_write, so we face the same problem.
+
+This patch makes mpage_add_bh_to_extent() stop the loop after we've
+accumulated 2048 pages, by setting mpd->io_done = 1, which ultimately
+causes the write_cache_pages() loop to break.
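+
+(The 2048 figure is just the 8MiB cutoff used in the patch expressed in
+pages: nrblocks is compared against 8*1024*1024 / s_blocksize, i.e.
+8MiB worth of file data, which is 2048 pages with the usual 4KiB
+PAGE_SIZE.)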
+
+Repeating the test with a dirty_ratio of 80 (to leave something for
+fsync to do), I don't see huge IO performance gains, but the reduction
+in cpu usage is striking: 80% usage with stock, and 2% with the
+below patch. Instrumenting the loop in write_cache_pages clearly
+shows that we are wasting time here.
+
+Eventually we need to change mpage_da_map_pages() to also submit its I/O
+to the block layer, subsuming mpage_da_submit_io(), and then change it to
+call ext4_get_blocks() multiple times.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2361,6 +2361,15 @@ static void mpage_add_bh_to_extent(struc
+ sector_t next;
+ int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+
++ /*
++ * XXX Don't go larger than mballoc is willing to allocate
++ * This is a stopgap solution. We eventually need to fold
++ * mpage_da_submit_io() into this function and then call
++ * ext4_get_blocks() multiple times in a loop
++ */
++ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
++ goto flush_it;
++
+ /* check if thereserved journal credits might overflow */
+ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (nrblocks >= EXT4_MAX_TRANS_DATA) {
--- /dev/null
+From 3f9db529f4db9500a2bc9d296258a0dd8f9ac03e Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:53 -0400
+Subject: ext4: Do not zero out uninitialized extents beyond i_size
+
+commit 21ca087a3891efab4d45488db8febee474d26c68 upstream (as of v2.6.34-git13)
+
+The extents code will sometimes zero out blocks and mark them as
+initialized instead of splitting an extent into several smaller ones.
+This optimization, however, causes problems if the extent is beyond
+i_size, because fsck will complain if there are uninitialized blocks
+after i_size, as this cannot be distinguished from an inode that has
+an incorrect i_size field.
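+
+As a worked example of the new check (illustrative numbers): with 4KiB
+blocks and i_size = 10KiB, eof_block is 3; a write into block 1 of an
+uninitialized extent covering blocks 0-7 gives ee_block + ee_len = 8,
+which is beyond eof_block, so may_zeroout is false and the extent is
+split rather than zeroed out and marked initialized.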
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15742
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 67 +++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 51 insertions(+), 16 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2533,11 +2533,21 @@ static int ext4_ext_convert_to_initializ
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+- ext4_lblk_t ee_block;
++ ext4_lblk_t ee_block, eof_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
+ int ret = 0;
++ int may_zeroout;
++
++ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
++ "block %llu, max_blocks %u\n", inode->i_ino,
++ (unsigned long long)iblock, max_blocks);
++
++ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
++ inode->i_sb->s_blocksize_bits;
++ if (eof_block < iblock + max_blocks)
++ eof_block = iblock + max_blocks;
+
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+@@ -2546,16 +2556,23 @@ static int ext4_ext_convert_to_initializ
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
++
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
++ /*
++ * It is safe to convert extent to initialized via explicit
++ * zeroout only if extent is fully insde i_size or new_size.
++ */
++ may_zeroout = ee_block + ee_len <= eof_block;
++
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
+- if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
++ if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+@@ -2586,7 +2603,7 @@ static int ext4_ext_convert_to_initializ
+ if (allocated > max_blocks) {
+ unsigned int newdepth;
+ /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
+- if (allocated <= EXT4_EXT_ZERO_LEN) {
++ if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
+ /*
+ * iblock == ee_block is handled by the zerouout
+ * at the beginning.
+@@ -2662,7 +2679,7 @@ static int ext4_ext_convert_to_initializ
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
+- if (err == -ENOSPC) {
++ if (err == -ENOSPC && may_zeroout) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+@@ -2686,8 +2703,10 @@ static int ext4_ext_convert_to_initializ
+ * update the extent length after successful insert of the
+ * split extent
+ */
+- orig_ex.ee_len = cpu_to_le16(ee_len -
+- ext4_ext_get_actual_len(ex3));
++ ee_len -= ext4_ext_get_actual_len(ex3);
++ orig_ex.ee_len = cpu_to_le16(ee_len);
++ may_zeroout = ee_block + ee_len <= eof_block;
++
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+@@ -2711,7 +2730,7 @@ static int ext4_ext_convert_to_initializ
+ * otherwise give the extent a chance to merge to left
+ */
+ if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
+- iblock != ee_block) {
++ iblock != ee_block && may_zeroout) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+@@ -2780,7 +2799,7 @@ static int ext4_ext_convert_to_initializ
+ goto out;
+ insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+- if (err == -ENOSPC) {
++ if (err == -ENOSPC && may_zeroout) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+@@ -2840,14 +2859,21 @@ static int ext4_split_unwritten_extents(
+ struct ext4_extent *ex2 = NULL;
+ struct ext4_extent *ex3 = NULL;
+ struct ext4_extent_header *eh;
+- ext4_lblk_t ee_block;
++ ext4_lblk_t ee_block, eof_block;
+ unsigned int allocated, ee_len, depth;
+ ext4_fsblk_t newblock;
+ int err = 0;
++ int may_zeroout;
++
++ ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
++ "block %llu, max_blocks %u\n", inode->i_ino,
++ (unsigned long long)iblock, max_blocks);
++
++ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
++ inode->i_sb->s_blocksize_bits;
++ if (eof_block < iblock + max_blocks)
++ eof_block = iblock + max_blocks;
+
+- ext_debug("ext4_split_unwritten_extents: inode %lu,"
+- "iblock %llu, max_blocks %u\n", inode->i_ino,
+- (unsigned long long)iblock, max_blocks);
+ depth = ext_depth(inode);
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+@@ -2855,12 +2881,19 @@ static int ext4_split_unwritten_extents(
+ ee_len = ext4_ext_get_actual_len(ex);
+ allocated = ee_len - (iblock - ee_block);
+ newblock = iblock - ee_block + ext_pblock(ex);
++
+ ex2 = ex;
+ orig_ex.ee_block = ex->ee_block;
+ orig_ex.ee_len = cpu_to_le16(ee_len);
+ ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+
+ /*
++ * It is safe to convert extent to initialized via explicit
++ * zeroout only if extent is fully insde i_size or new_size.
++ */
++ may_zeroout = ee_block + ee_len <= eof_block;
++
++ /*
+ * If the uninitialized extent begins at the same logical
+ * block where the write begins, and the write completely
+ * covers the extent, then we don't need to split it.
+@@ -2894,7 +2927,7 @@ static int ext4_split_unwritten_extents(
+ ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+ ext4_ext_mark_uninitialized(ex3);
+ err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+- if (err == -ENOSPC) {
++ if (err == -ENOSPC && may_zeroout) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
+@@ -2918,8 +2951,10 @@ static int ext4_split_unwritten_extents(
+ * update the extent length after successful insert of the
+ * split extent
+ */
+- orig_ex.ee_len = cpu_to_le16(ee_len -
+- ext4_ext_get_actual_len(ex3));
++ ee_len -= ext4_ext_get_actual_len(ex3);
++ orig_ex.ee_len = cpu_to_le16(ee_len);
++ may_zeroout = ee_block + ee_len <= eof_block;
++
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+@@ -2965,7 +3000,7 @@ static int ext4_split_unwritten_extents(
+ goto out;
+ insert:
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+- if (err == -ENOSPC) {
++ if (err == -ENOSPC && may_zeroout) {
+ err = ext4_ext_zeroout(inode, &orig_ex);
+ if (err)
+ goto fix_extent_len;
--- /dev/null
+From ae42cce7e825bdc82a8e9c30a87c342d1e364e57 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:54 -0400
+Subject: ext4: clean up inode bitmaps manipulation in ext4_free_inode
+
+commit d17413c08cd2b1dd2bf2cfdbb0f7b736b2b2b15c upstream (as of v2.6.34-git13)
+
+- Reorganize the locking scheme to batch two atomic operations into one.
+  This also allows us to state the rule that a healthy group must obey:
+  ext4_free_inodes_count(sb, gdp) == ext4_count_free(inode_bitmap, NUM);
+- Fix a possible undefined pointer dereference.
+- Even if the group descriptor stats aren't accessible, we still have to
+  update the inode bitmap.
+- Move updates of non-group members out of the group lock.
+
+Note: this commit has been observed to fix fs corruption problems
+under heavy fs load
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ialloc.c | 85 +++++++++++++++++++++++++------------------------------
+ 1 file changed, 39 insertions(+), 46 deletions(-)
+
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -244,57 +244,50 @@ void ext4_free_inode(handle_t *handle, s
+ if (fatal)
+ goto error_return;
+
+- /* Ok, now we can actually update the inode bitmaps.. */
+- cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+- bit, bitmap_bh->b_data);
+- if (!cleared)
+- ext4_error(sb, "ext4_free_inode",
+- "bit already cleared for inode %lu", ino);
+- else {
+- gdp = ext4_get_group_desc(sb, block_group, &bh2);
+-
++ fatal = -ESRCH;
++ gdp = ext4_get_group_desc(sb, block_group, &bh2);
++ if (gdp) {
+ BUFFER_TRACE(bh2, "get_write_access");
+ fatal = ext4_journal_get_write_access(handle, bh2);
+- if (fatal) goto error_return;
+-
+- if (gdp) {
+- ext4_lock_group(sb, block_group);
+- count = ext4_free_inodes_count(sb, gdp) + 1;
+- ext4_free_inodes_set(sb, gdp, count);
+- if (is_directory) {
+- count = ext4_used_dirs_count(sb, gdp) - 1;
+- ext4_used_dirs_set(sb, gdp, count);
+- if (sbi->s_log_groups_per_flex) {
+- ext4_group_t f;
+-
+- f = ext4_flex_group(sbi, block_group);
+- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+- }
+-
+- }
+- gdp->bg_checksum = ext4_group_desc_csum(sbi,
+- block_group, gdp);
+- ext4_unlock_group(sb, block_group);
+- percpu_counter_inc(&sbi->s_freeinodes_counter);
+- if (is_directory)
+- percpu_counter_dec(&sbi->s_dirs_counter);
++ }
++ ext4_lock_group(sb, block_group);
++ cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
++ if (fatal || !cleared) {
++ ext4_unlock_group(sb, block_group);
++ goto out;
++ }
+
+- if (sbi->s_log_groups_per_flex) {
+- ext4_group_t f;
++ count = ext4_free_inodes_count(sb, gdp) + 1;
++ ext4_free_inodes_set(sb, gdp, count);
++ if (is_directory) {
++ count = ext4_used_dirs_count(sb, gdp) - 1;
++ ext4_used_dirs_set(sb, gdp, count);
++ percpu_counter_dec(&sbi->s_dirs_counter);
++ }
++ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
++ ext4_unlock_group(sb, block_group);
+
+- f = ext4_flex_group(sbi, block_group);
+- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+- }
+- }
+- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+- err = ext4_handle_dirty_metadata(handle, NULL, bh2);
+- if (!fatal) fatal = err;
++ percpu_counter_inc(&sbi->s_freeinodes_counter);
++ if (sbi->s_log_groups_per_flex) {
++ ext4_group_t f = ext4_flex_group(sbi, block_group);
++
++ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
++ if (is_directory)
++ atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ }
+- BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+- if (!fatal)
+- fatal = err;
+- sb->s_dirt = 1;
++ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
++ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
++out:
++ if (cleared) {
++ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
++ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
++ if (!fatal)
++ fatal = err;
++ sb->s_dirt = 1;
++ } else
++ ext4_error(sb, "ext4_free_inode",
++ "bit already cleared for inode %lu", ino);
++
+ error_return:
+ brelse(bitmap_bh);
+ ext4_std_error(sb, fatal);
--- /dev/null
+From 73337c4a1e35c3dedceb9e2d3af84da8614e6a45 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:55 -0400
+Subject: ext4: init statistics after journal recovery
+
+commit 84061e07c5fbbbf9dc8aef8fb750fc3a2dfc31f3 upstream (as of v2.6.34-git13)
+
+Currently the block/inode/dir counters are initialized before the
+journal is recovered.  In fact this information will probably change
+during journal recovery, and the free-blocks count is critical for
+correct delalloc mode accounting.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15768
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Acked-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 41 ++++++++++++++++++-----------------------
+ 1 file changed, 18 insertions(+), 23 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2695,24 +2695,6 @@ static int ext4_fill_super(struct super_
+ get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+ spin_lock_init(&sbi->s_next_gen_lock);
+
+- err = percpu_counter_init(&sbi->s_freeblocks_counter,
+- ext4_count_free_blocks(sb));
+- if (!err) {
+- err = percpu_counter_init(&sbi->s_freeinodes_counter,
+- ext4_count_free_inodes(sb));
+- }
+- if (!err) {
+- err = percpu_counter_init(&sbi->s_dirs_counter,
+- ext4_count_dirs(sb));
+- }
+- if (!err) {
+- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+- }
+- if (err) {
+- ext4_msg(sb, KERN_ERR, "insufficient memory");
+- goto failed_mount3;
+- }
+-
+ sbi->s_stripe = ext4_get_stripe_size(sbi);
+ sbi->s_max_writeback_mb_bump = 128;
+
+@@ -2832,7 +2814,20 @@ static int ext4_fill_super(struct super_
+ set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+
+ no_journal:
+-
++ err = percpu_counter_init(&sbi->s_freeblocks_counter,
++ ext4_count_free_blocks(sb));
++ if (!err)
++ err = percpu_counter_init(&sbi->s_freeinodes_counter,
++ ext4_count_free_inodes(sb));
++ if (!err)
++ err = percpu_counter_init(&sbi->s_dirs_counter,
++ ext4_count_dirs(sb));
++ if (!err)
++ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
++ if (err) {
++ ext4_msg(sb, KERN_ERR, "insufficient memory");
++ goto failed_mount_wq;
++ }
+ if (test_opt(sb, NOBH)) {
+ if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
+ ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
+@@ -2965,6 +2960,10 @@ failed_mount_wq:
+ jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
+ }
++ percpu_counter_destroy(&sbi->s_freeblocks_counter);
++ percpu_counter_destroy(&sbi->s_freeinodes_counter);
++ percpu_counter_destroy(&sbi->s_dirs_counter);
++ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ failed_mount3:
+ if (sbi->s_flex_groups) {
+ if (is_vmalloc_addr(sbi->s_flex_groups))
+@@ -2972,10 +2971,6 @@ failed_mount3:
+ else
+ kfree(sbi->s_flex_groups);
+ }
+- percpu_counter_destroy(&sbi->s_freeblocks_counter);
+- percpu_counter_destroy(&sbi->s_freeinodes_counter);
+- percpu_counter_destroy(&sbi->s_dirs_counter);
+- percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ failed_mount2:
+ for (i = 0; i < db_count; i++)
+ brelse(sbi->s_group_desc[i]);
--- /dev/null
+From 2db9e1a9cc528228b60ece755187b60331db966d Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:56 -0400
+Subject: ext4: Remove extraneous newlines in ext4_msg() calls
+
+commit fbe845ddf368f77f86aa7500f8fd2690f54c66a8 upstream (as of v2.6.34-git13)
+
+Addresses-Google-Bug: #2562325
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c | 6 +++---
+ fs/ext4/super.c | 2 +-
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2294,7 +2294,7 @@ static int mpage_da_map_blocks(struct mp
+ ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+ "delayed block allocation failed for inode %lu at "
+ "logical offset %llu with max blocks %zd with "
+- "error %d\n", mpd->inode->i_ino,
++ "error %d", mpd->inode->i_ino,
+ (unsigned long long) next,
+ mpd->b_size >> mpd->inode->i_blkbits, err);
+ printk(KERN_CRIT "This should not happen!! "
+@@ -2956,7 +2956,7 @@ retry:
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
+- "%ld pages, ino %lu; err %d\n", __func__,
++ "%ld pages, ino %lu; err %d", __func__,
+ wbc->nr_to_write, inode->i_ino, ret);
+ goto out_writepages;
+ }
+@@ -3031,7 +3031,7 @@ retry:
+ if (pages_skipped != wbc->pages_skipped)
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "This should not happen leaving %s "
+- "with nr_to_write = %ld ret = %d\n",
++ "with nr_to_write = %ld ret = %d",
+ __func__, wbc->nr_to_write, ret);
+
+ /* Update index */
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2902,7 +2902,7 @@ no_journal:
+ err = ext4_setup_system_zone(sb);
+ if (err) {
+ ext4_msg(sb, KERN_ERR, "failed to initialize system "
+- "zone (%d)\n", err);
++ "zone (%d)", err);
+ goto failed_mount4;
+ }
+
--- /dev/null
+From 1050094d53941e319e9d50d4171f060dddd5dc87 Mon Sep 17 00:00:00 2001
+From: Nikanth Karthikesan <knikanth@suse.de>
+Date: Sun, 30 May 2010 22:49:57 -0400
+Subject: ext4: Prevent creation of files larger than RLIMIT_FSIZE using fallocate
+
+commit 6d19c42b7cf81c39632b6d4dbc514e8449bcd346 upstream (as of v2.6.34-git13)
+
+Currently, using posix_fallocate(), one can bypass an RLIMIT_FSIZE limit
+and create a file larger than the limit.  Add a check for that.
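+
+A minimal userspace sketch of the scenario (illustrative only; the path
+is made up and error handling is omitted):
+
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/resource.h>
+
+int main(void)
+{
+	struct rlimit rl = { .rlim_cur = 1 << 20, .rlim_max = 1 << 20 };
+	int fd, err;
+
+	signal(SIGXFSZ, SIG_IGN);	/* don't get killed by the size signal */
+	setrlimit(RLIMIT_FSIZE, &rl);	/* cap file size at 1MiB */
+	fd = open("/mnt/test/bigfile", O_CREAT | O_RDWR, 0644);
+	err = posix_fallocate(fd, 0, 16 << 20);	/* ask for 16MiB */
+	/* without the check the allocation goes through despite the limit;
+	 * with it, the fallocate should now fail with EFBIG */
+	printf("posix_fallocate returned %d\n", err);
+	return 0;
+}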
+
+Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
+Signed-off-by: Amit Arora <aarora@in.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3607,6 +3607,11 @@ long ext4_fallocate(struct inode *inode,
+ */
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ mutex_lock(&inode->i_mutex);
++ ret = inode_newsize_ok(inode, (len + offset));
++ if (ret) {
++ mutex_unlock(&inode->i_mutex);
++ return ret;
++ }
+ retry:
+ while (ret >= 0 && ret < max_blocks) {
+ block = block + ret;
--- /dev/null
+From 7d4df70b86aef3e1c2b92bede60009527b3470fd Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:58 -0400
+Subject: ext4: check for a good block group before loading buddy pages
+
+commit 8a57d9d61a6e361c7bb159dda797672c1df1a691 upstream (as of v2.6.34-git13)
+
+This adds a new field in ext4_group_info to cache the order of the
+largest free extent in a block group, and avoids loading the buddy pages
+until *after* we've done a sanity check on the block group.
+
+With large allocation requests (e.g., an 8MiB fallocate()) and relatively
+full partitions, it's easy to have no block groups with a block extent
+large enough to satisfy the input request length.  This currently causes
+the loop during cr == 0 in ext4_mb_regular_allocator() to load the buddy
+bitmap pages for EVERY block group.  That can be a lot of pages.  The
+patch below allows us to call ext4_mb_good_group() BEFORE we load the
+buddy pages (although we have to check again after we lock the block
+group).
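+
+To make the saving concrete (illustrative numbers): an 8MiB request on a
+4KiB-block file system is 2048 = 2^11 blocks, so ac->ac_2order is 11 and
+any group whose cached bb_largest_free_order is below 11 is now rejected
+by ext4_mb_good_group() without its buddy bitmap pages ever being read.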
+
+Addresses-Google-Bug: #2578108
+Addresses-Google-Bug: #2704453
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 1
+ fs/ext4/mballoc.c | 70 +++++++++++++++++++++++++++++++++++++++++++-----------
+ 2 files changed, 58 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1657,6 +1657,7 @@ struct ext4_group_info {
+ ext4_grpblk_t bb_first_free; /* first free block */
+ ext4_grpblk_t bb_free; /* total free blocks */
+ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
++ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
+ struct list_head bb_prealloc_list;
+ #ifdef DOUBLE_CHECK
+ void *bb_bitmap;
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(str
+ }
+ }
+
++/*
++ * Cache the order of the largest free extent we have available in this block
++ * group.
++ */
++static void
++mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
++{
++ int i;
++ int bits;
++
++ grp->bb_largest_free_order = -1; /* uninit */
++
++ bits = sb->s_blocksize_bits + 1;
++ for (i = bits; i >= 0; i--) {
++ if (grp->bb_counters[i] > 0) {
++ grp->bb_largest_free_order = i;
++ break;
++ }
++ }
++}
++
+ static noinline_for_stack
+ void ext4_mb_generate_buddy(struct super_block *sb,
+ void *buddy, void *bitmap, ext4_group_t group)
+@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super
+ */
+ grp->bb_free = free;
+ }
++ mb_set_largest_free_order(sb, grp);
+
+ clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+
+@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super
+ * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
+ * So it can have information regarding groups_per_page which
+ * is blocks_per_page/2
++ *
++ * Locking note: This routine takes the block group lock of all groups
++ * for this page; do not hold this lock when calling this routine!
+ */
+
+ static int ext4_mb_init_cache(struct page *page, char *incore)
+@@ -910,6 +935,11 @@ out:
+ return err;
+ }
+
++/*
++ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
++ * block group lock of all groups for this page; do not hold the BG lock when
++ * calling this routine!
++ */
+ static noinline_for_stack
+ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ {
+@@ -1004,6 +1034,11 @@ err:
+ return ret;
+ }
+
++/*
++ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
++ * block group lock of all groups for this page; do not hold the BG lock when
++ * calling this routine!
++ */
+ static noinline_for_stack int
+ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+ struct ext4_buddy *e4b)
+@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode
+ buddy = buddy2;
+ } while (1);
+ }
++ mb_set_largest_free_order(sb, e4b->bd_info);
+ mb_check_buddy(e4b);
+ }
+
+@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_budd
+ e4b->bd_info->bb_counters[ord]++;
+ e4b->bd_info->bb_counters[ord]++;
+ }
++ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
+
+ mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+ mb_check_buddy(e4b);
+@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_al
+ }
+ }
+
++/* This is now called BEFORE we load the buddy bitmap. */
+ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+ ext4_group_t group, int cr)
+ {
+ unsigned free, fragments;
+- unsigned i, bits;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
+ struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+
+ BUG_ON(cr < 0 || cr >= 4);
+- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
++
++ /* We only do this if the grp has never been initialized */
++ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
++ int ret = ext4_mb_init_group(ac->ac_sb, group);
++ if (ret)
++ return 0;
++ }
+
+ free = grp->bb_free;
+ fragments = grp->bb_fragments;
+@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext
+ case 0:
+ BUG_ON(ac->ac_2order == 0);
+
++ if (grp->bb_largest_free_order < ac->ac_2order)
++ return 0;
++
+ /* Avoid using the first bg of a flexgroup for data files */
+ if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+ (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+ ((group % flex_size) == 0))
+ return 0;
+
+- bits = ac->ac_sb->s_blocksize_bits + 1;
+- for (i = ac->ac_2order; i <= bits; i++)
+- if (grp->bb_counters[i] > 0)
+- return 1;
+- break;
++ return 1;
+ case 1:
+ if ((free / fragments) >= ac->ac_g_ex.fe_len)
+ return 1;
+@@ -2026,14 +2068,11 @@ repeat:
+ group = ac->ac_g_ex.fe_group;
+
+ for (i = 0; i < ngroups; group++, i++) {
+- struct ext4_group_info *grp;
+-
+ if (group == ngroups)
+ group = 0;
+
+- /* quick check to skip empty groups */
+- grp = ext4_get_group_info(sb, group);
+- if (grp->bb_free == 0)
++ /* This now checks without needing the buddy page */
++ if (!ext4_mb_good_group(ac, group, cr))
+ continue;
+
+ err = ext4_mb_load_buddy(sb, group, &e4b);
+@@ -2041,8 +2080,12 @@ repeat:
+ goto out;
+
+ ext4_lock_group(sb, group);
++
++ /*
++ * We need to check again after locking the
++ * block group
++ */
+ if (!ext4_mb_good_group(ac, group, cr)) {
+- /* someone did allocation from this group */
+ ext4_unlock_group(sb, group);
+ ext4_mb_unload_buddy(&e4b);
+ continue;
+@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_b
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+ init_rwsem(&meta_group_info[i]->alloc_sem);
+ meta_group_info[i]->bb_free_root.rb_node = NULL;
++ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
+
+ #ifdef DOUBLE_CHECK
+ {
--- /dev/null
+From ab93377b76de07d4c8aacde97418651c7df6854e Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Sun, 30 May 2010 22:49:59 -0400
+Subject: ext4: Show journal_checksum option
+
+commit 39a4bade8c1826b658316d66ee81c09b0a4d7d42 upstream (as of v2.6.34-git13)
+
+We failed to show the journal_checksum option in /proc/mounts.  Fix it.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -877,6 +877,8 @@ static int ext4_show_options(struct seq_
+ seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ seq_puts(seq, ",journal_async_commit");
++ else if (test_opt(sb, JOURNAL_CHECKSUM))
++ seq_puts(seq, ",journal_checksum");
+ if (test_opt(sb, NOBH))
+ seq_puts(seq, ",nobh");
+ if (test_opt(sb, I_VERSION))
--- /dev/null
+From cc781d3f1f03b2fd24b7260ed319dc34bf605ed0 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:50:00 -0400
+Subject: ext4: Use bitops to read/modify i_flags in struct ext4_inode_info
+
+commit 12e9b892002d9af057655d35b44db8ee9243b0dc upstream (as of v2.6.34-git13)
+
+In several places we modify EXT4_I(inode)->i_flags without holding
+i_mutex (ext4_do_update_inode, ...).  These modifications are racy and
+we can lose updates to i_flags.  So convert the handling of i_flags to
+use bitops, which are atomic.
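+
+To illustrate the race (not code from this patch), a plain
+read-modify-write of i_flags on two CPUs can lose one of the updates:
+
+	CPU0				CPU1
+	tmp = ei->i_flags;
+					tmp = ei->i_flags;
+	tmp |= EXT4_EOFBLOCKS_FL;
+					tmp &= ~EXT4_JOURNAL_DATA_FL;
+	ei->i_flags = tmp;
+					ei->i_flags = tmp;  /* EOFBLOCKS_FL update lost */
+
+set_bit()/clear_bit()/test_bit() on an unsigned long perform each update
+atomically, which is why i_flags becomes an unsigned long and the
+EXT4_INODE_* bit numbers are introduced.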
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15792
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/dir.c | 4 -
+ fs/ext4/ext4.h | 109 +++++++++++++++++++++++++++++++++++++++++++-------
+ fs/ext4/ext4_jbd2.h | 6 +-
+ fs/ext4/extents.c | 10 ++--
+ fs/ext4/file.c | 2
+ fs/ext4/ialloc.c | 4 -
+ fs/ext4/inode.c | 30 ++++++-------
+ fs/ext4/mballoc.c | 4 -
+ fs/ext4/migrate.c | 2
+ fs/ext4/move_extent.c | 4 -
+ fs/ext4/namei.c | 10 ++--
+ fs/ext4/super.c | 1
+ fs/ext4/xattr.c | 4 -
+ 13 files changed, 135 insertions(+), 55 deletions(-)
+
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *fil
+
+ if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
++ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+ ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+ err = ext4_dx_readdir(filp, dirent, filldir);
+ if (err != ERR_BAD_DX_DIR) {
+@@ -122,7 +122,7 @@ static int ext4_readdir(struct file *fil
+ * We don't set the inode dirty flag since it's not
+ * critical that it get flushed back to the disk.
+ */
+- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
++ ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
+ }
+ stored = 0;
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -315,6 +315,83 @@ static inline __u32 ext4_mask_flags(umod
+ return flags & EXT4_OTHER_FLMASK;
+ }
+
++/*
++ * Inode flags used for atomic set/get
++ */
++enum {
++ EXT4_INODE_SECRM = 0, /* Secure deletion */
++ EXT4_INODE_UNRM = 1, /* Undelete */
++ EXT4_INODE_COMPR = 2, /* Compress file */
++ EXT4_INODE_SYNC = 3, /* Synchronous updates */
++ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
++ EXT4_INODE_APPEND = 5, /* writes to file may only append */
++ EXT4_INODE_NODUMP = 6, /* do not dump file */
++ EXT4_INODE_NOATIME = 7, /* do not update atime */
++/* Reserved for compression usage... */
++ EXT4_INODE_DIRTY = 8,
++ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
++ EXT4_INODE_NOCOMPR = 10, /* Don't compress */
++ EXT4_INODE_ECOMPR = 11, /* Compression error */
++/* End compression flags --- maybe not all used */
++ EXT4_INODE_INDEX = 12, /* hash-indexed directory */
++ EXT4_INODE_IMAGIC = 13, /* AFS directory */
++ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
++ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
++ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
++ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
++ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
++ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
++ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
++ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
++ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
++};
++
++#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
++#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
++ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
++ EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
++
++/*
++ * Since it's pretty easy to mix up bit numbers and hex values, and we
++ * can't do a compile-time test for ENUM values, we use a run-time
++ * test to make sure that EXT4_XXX_FL is consistent with respect to
++ * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
++ * out so it won't cost any extra space in the compiled kernel image.
++ * But it's important that these values are the same, since we are
++ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
++ * must be consistent with the values of FS_XXX_FL defined in
++ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
++ * ext4 filesystems, and of course the values defined in e2fsprogs.
++ *
++ * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
++ */
++static inline void ext4_check_flag_values(void)
++{
++ CHECK_FLAG_VALUE(SECRM);
++ CHECK_FLAG_VALUE(UNRM);
++ CHECK_FLAG_VALUE(COMPR);
++ CHECK_FLAG_VALUE(SYNC);
++ CHECK_FLAG_VALUE(IMMUTABLE);
++ CHECK_FLAG_VALUE(APPEND);
++ CHECK_FLAG_VALUE(NODUMP);
++ CHECK_FLAG_VALUE(NOATIME);
++ CHECK_FLAG_VALUE(DIRTY);
++ CHECK_FLAG_VALUE(COMPRBLK);
++ CHECK_FLAG_VALUE(NOCOMPR);
++ CHECK_FLAG_VALUE(ECOMPR);
++ CHECK_FLAG_VALUE(INDEX);
++ CHECK_FLAG_VALUE(IMAGIC);
++ CHECK_FLAG_VALUE(JOURNAL_DATA);
++ CHECK_FLAG_VALUE(NOTAIL);
++ CHECK_FLAG_VALUE(DIRSYNC);
++ CHECK_FLAG_VALUE(TOPDIR);
++ CHECK_FLAG_VALUE(HUGE_FILE);
++ CHECK_FLAG_VALUE(EXTENTS);
++ CHECK_FLAG_VALUE(EA_INODE);
++ CHECK_FLAG_VALUE(EOFBLOCKS);
++ CHECK_FLAG_VALUE(RESERVED);
++}
++
+ /* Used to pass group descriptor data when online resize is done */
+ struct ext4_new_group_input {
+ __u32 group; /* Group number for this data */
+@@ -603,9 +680,8 @@ struct ext4_ext_cache {
+ */
+ struct ext4_inode_info {
+ __le32 i_data[15]; /* unconverted */
+- __u32 i_flags;
+- ext4_fsblk_t i_file_acl;
+ __u32 i_dtime;
++ ext4_fsblk_t i_file_acl;
+
+ /*
+ * i_block_group is the number of the block group which contains
+@@ -616,6 +692,7 @@ struct ext4_inode_info {
+ */
+ ext4_group_t i_block_group;
+ unsigned long i_state_flags; /* Dynamic state flags */
++ unsigned long i_flags;
+
+ ext4_lblk_t i_dir_start_lookup;
+ #ifdef CONFIG_EXT4_FS_XATTR
+@@ -1049,20 +1126,22 @@ enum {
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
+ };
+
+-static inline int ext4_test_inode_state(struct inode *inode, int bit)
+-{
+- return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+-}
+-
+-static inline void ext4_set_inode_state(struct inode *inode, int bit)
+-{
+- set_bit(bit, &EXT4_I(inode)->i_state_flags);
++#define EXT4_INODE_BIT_FNS(name, field) \
++static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
++{ \
++ return test_bit(bit, &EXT4_I(inode)->i_##field); \
++} \
++static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
++{ \
++ set_bit(bit, &EXT4_I(inode)->i_##field); \
++} \
++static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
++{ \
++ clear_bit(bit, &EXT4_I(inode)->i_##field); \
+ }
+
+-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+-{
+- clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+-}
++EXT4_INODE_BIT_FNS(flag, flags)
++EXT4_INODE_BIT_FNS(state, state_flags)
+ #else
+ /* Assume that user mode programs are passing in an ext4fs superblock, not
+ * a kernel struct super_block. This will allow us to call the feature-test
+@@ -1247,7 +1326,7 @@ struct ext4_dir_entry_2 {
+
+ #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
+ EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+- (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
++ ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
+ #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
+ #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+
+--- a/fs/ext4/ext4_jbd2.h
++++ b/fs/ext4/ext4_jbd2.h
+@@ -282,7 +282,7 @@ static inline int ext4_should_journal_da
+ return 1;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+ return 1;
+- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
++ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return 1;
+ return 0;
+ }
+@@ -293,7 +293,7 @@ static inline int ext4_should_order_data
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
++ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+ return 1;
+@@ -306,7 +306,7 @@ static inline int ext4_should_writeback_
+ return 0;
+ if (EXT4_JOURNAL(inode) == NULL)
+ return 1;
+- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
++ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return 0;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+ return 1;
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3409,12 +3409,12 @@ int ext4_ext_get_blocks(handle_t *handle
+ }
+ }
+
+- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
++ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+ if (eh->eh_entries) {
+ last_ex = EXT_LAST_EXTENT(eh);
+ if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+ + ext4_ext_get_actual_len(last_ex))
+- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ } else {
+ WARN_ON(eh->eh_entries == 0);
+ ext4_error(inode->i_sb, __func__,
+@@ -3560,7 +3560,7 @@ static void ext4_falloc_update_inode(str
+ * can proceed even if the new size is the same as i_size.
+ */
+ if (new_size > i_size_read(inode))
+- EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
++ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ }
+
+ }
+@@ -3588,7 +3588,7 @@ long ext4_fallocate(struct inode *inode,
+ * currently supporting (pre)allocate mode for extent-based
+ * files _only_
+ */
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
+
+ /* preallocation to directories is currently not supported */
+@@ -3838,7 +3838,7 @@ int ext4_fiemap(struct inode *inode, str
+ int error = 0;
+
+ /* fallback to generic here if not in extents fmt */
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);
+
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -65,7 +65,7 @@ ext4_file_write(struct kiocb *iocb, cons
+ * is smaller than s_maxbytes, which is for extent-mapped files.
+ */
+
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ size_t length = iov_length(iov, nr_segs);
+
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -497,7 +497,7 @@ static int find_group_orlov(struct super
+
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
++ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
+ int best_ndir = inodes_per_group;
+ int ret = -1;
+
+@@ -1044,7 +1044,7 @@ got:
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ /* set extent flag only for directory, file and normal symlink*/
+ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
+- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
++ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode);
+ }
+ }
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -957,7 +957,7 @@ static int ext4_ind_get_blocks(handle_t
+ int count = 0;
+ ext4_fsblk_t first_block = 0;
+
+- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
++ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+ J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+ depth = ext4_block_to_path(inode, iblock, offsets,
+ &blocks_to_boundary);
+@@ -1085,7 +1085,7 @@ static int ext4_indirect_calc_metadata_a
+ */
+ static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+ {
+- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ext_calc_metadata_amount(inode, lblock);
+
+ return ext4_indirect_calc_metadata_amount(inode, lblock);
+@@ -1274,7 +1274,7 @@ int ext4_get_blocks(handle_t *handle, st
+ * file system block.
+ */
+ down_read((&EXT4_I(inode)->i_data_sem));
+- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
+ bh, 0);
+ } else {
+@@ -1336,7 +1336,7 @@ int ext4_get_blocks(handle_t *handle, st
+ * We need to check for EXT4 here because migrate
+ * could have changed the inode type in between
+ */
+- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ retval = ext4_ext_get_blocks(handle, inode, block, max_blocks,
+ bh, flags);
+ } else {
+@@ -2371,7 +2371,7 @@ static void mpage_add_bh_to_extent(struc
+ goto flush_it;
+
+ /* check if thereserved journal credits might overflow */
+- if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
++ if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+ if (nrblocks >= EXT4_MAX_TRANS_DATA) {
+ /*
+ * With non-extent format we are limited by the journal
+@@ -2836,7 +2836,7 @@ static int ext4_da_writepages_trans_bloc
+ * number of contiguous block. So we will limit
+ * number of contiguous block to a sane value
+ */
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+ (max_blocks > EXT4_MAX_TRANS_DATA))
+ max_blocks = EXT4_MAX_TRANS_DATA;
+
+@@ -3872,7 +3872,7 @@ static ssize_t ext4_direct_IO(int rw, st
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+
+- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+@@ -4503,12 +4503,12 @@ void ext4_truncate(struct inode *inode)
+ if (!ext4_can_truncate(inode))
+ return;
+
+- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+
+- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ext4_ext_truncate(inode);
+ return;
+ }
+@@ -5350,7 +5350,7 @@ int ext4_setattr(struct dentry *dentry,
+ }
+
+ if (attr->ia_valid & ATTR_SIZE) {
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+@@ -5363,7 +5363,7 @@ int ext4_setattr(struct dentry *dentry,
+ if (S_ISREG(inode->i_mode) &&
+ attr->ia_valid & ATTR_SIZE &&
+ (attr->ia_size < inode->i_size ||
+- (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
++ (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, 3);
+@@ -5395,7 +5395,7 @@ int ext4_setattr(struct dentry *dentry,
+ }
+ }
+ /* ext4_truncate will clear the flag */
+- if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
++ if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
+ ext4_truncate(inode);
+ }
+
+@@ -5471,7 +5471,7 @@ static int ext4_indirect_trans_blocks(st
+
+ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+ {
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+ return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ }
+@@ -5806,9 +5806,9 @@ int ext4_change_inode_journal_flag(struc
+ */
+
+ if (val)
+- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
++ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ else
+- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
++ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+ ext4_set_aops(inode);
+
+ jbd2_journal_unlock_updates(journal);
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2008,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_al
+ sbi = EXT4_SB(sb);
+ ngroups = ext4_get_groups_count(sb);
+ /* non-extent files are limited to low blocks/groups */
+- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
+ BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+@@ -3176,7 +3176,7 @@ ext4_mb_use_preallocated(struct ext4_all
+ continue;
+
+ /* non-extent files can't have physical blocks past 2^32 */
+- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
++ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+ pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+ continue;
+
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -465,7 +465,7 @@ int ext4_ext_migrate(struct inode *inode
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EINVAL;
+
+ if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -975,11 +975,11 @@ mext_check_arguments(struct inode *orig_
+ }
+
+ /* Ext4 move extent supports only extent based file */
+- if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
++ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: orig file is not extents "
+ "based file [ino:orig %lu]\n", orig_inode->i_ino);
+ return -EOPNOTSUPP;
+- } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
++ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+ ext4_debug("ext4 move extent: donor file is not extents "
+ "based file [ino:donor %lu]\n", donor_inode->i_ino);
+ return -EOPNOTSUPP;
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -660,7 +660,7 @@ int ext4_htree_fill_tree(struct file *di
+ dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+ start_hash, start_minor_hash));
+ dir = dir_file->f_path.dentry->d_inode;
+- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
++ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
+ hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+ if (hinfo.hash_version <= DX_HASH_TEA)
+ hinfo.hash_version +=
+@@ -805,7 +805,7 @@ static void ext4_update_dx_flag(struct i
+ {
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
++ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
+
+ /*
+@@ -1424,7 +1424,7 @@ static int make_indexed_dir(handle_t *ha
+ brelse(bh);
+ return retval;
+ }
+- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
++ ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
+ data1 = bh2->b_data;
+
+ memcpy (data1, de, len);
+@@ -1497,7 +1497,7 @@ static int ext4_add_entry(handle_t *hand
+ retval = ext4_dx_add_entry(handle, dentry, inode);
+ if (!retval || (retval != ERR_BAD_DX_DIR))
+ return retval;
+- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
++ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+ dx_fallback++;
+ ext4_mark_inode_dirty(handle, dir);
+ }
+@@ -2292,7 +2292,7 @@ retry:
+ }
+ } else {
+ /* clear the extent format for fast symlink */
+- EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
++ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ inode->i_op = &ext4_fast_symlink_inode_operations;
+ memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
+ inode->i_size = l-1;
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3999,6 +3999,7 @@ static int __init init_ext4_fs(void)
+ {
+ int err;
+
++ ext4_check_flag_values();
+ err = init_ext4_system_zone();
+ if (err)
+ return err;
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -816,7 +816,7 @@ inserted:
+ EXT4_I(inode)->i_block_group);
+
+ /* non-extent files can't have physical blocks past 2^32 */
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+
+ block = ext4_new_meta_blocks(handle, inode,
+@@ -824,7 +824,7 @@ inserted:
+ if (error)
+ goto cleanup;
+
+- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+
+ ea_idebug(inode, "creating block %d", block);
--- /dev/null
+From 570f16c4bfa97a7b2d3b3e6c0b8936ee91f32481 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:50:01 -0400
+Subject: ext4: Avoid crashing on NULL ptr dereference on a filesystem error
+
+commit f70f362b4a6fe47c239dbfb3efc0cc2c10e4f09c upstream (as of v2.6.34-git13)
+
+If the EOFBLOCKS_FL flag is set when it should not be and the inode is
+zero length, then eh_entries is zero and ex is NULL, so dereferencing
+ex to print ex->ee_block causes a kernel OOPS in
+ext4_ext_map_blocks().
+
+On top of that, the error message which is printed isn't very helpful.
+So we fix this by printing something more explanatory which doesn't
+involve trying to print ex->ee_block.
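+
+A minimal userspace sketch of the failure mode and of the guard added here
+(illustration only: the struct, field, and helper names below are simplified
+stand-ins, not the real ext4 definitions):
+
+	#include <stdio.h>
+
+	struct extent { unsigned int ee_block; };
+	struct extent_header { int eh_entries; struct extent entries[4]; };
+
+	/* Returns NULL when the header is empty, mirroring the case the
+	 * patch guards against (eh_entries == 0, hence ex == NULL). */
+	static struct extent *last_extent(struct extent_header *eh)
+	{
+		return eh->eh_entries ? &eh->entries[eh->eh_entries - 1] : NULL;
+	}
+
+	static int check_eofblocks(struct extent_header *eh, unsigned long ino)
+	{
+		struct extent *ex = last_extent(eh);
+
+		if (!ex) {
+			/* Do not touch ex->ee_block here -- that is the
+			 * dereference being removed; report the error using
+			 * fields we actually have. */
+			fprintf(stderr, "inode #%lu: eh_entries = 0 and "
+				"EOFBLOCKS_FL set\n", ino);
+			return -5;	/* -EIO */
+		}
+		printf("last extent starts at block %u\n", ex->ee_block);
+		return 0;
+	}
+
+	int main(void)
+	{
+		struct extent_header empty = { .eh_entries = 0 };
+		return check_eofblocks(&empty, 12) ? 1 : 0;
+	}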
+
+Addresses-Google-Bug: #2655740
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3281,8 +3281,8 @@ int ext4_ext_get_blocks(handle_t *handle
+ */
+ if (path[depth].p_ext == NULL && depth != 0) {
+ ext4_error(inode->i_sb, __func__, "bad extent address "
+- "inode: %lu, iblock: %d, depth: %d",
+- inode->i_ino, iblock, depth);
++ "inode: %lu, iblock: %lu, depth: %d",
++ inode->i_ino, (unsigned long) iblock, depth);
+ err = -EIO;
+ goto out2;
+ }
+@@ -3418,8 +3418,11 @@ int ext4_ext_get_blocks(handle_t *handle
+ } else {
+ WARN_ON(eh->eh_entries == 0);
+ ext4_error(inode->i_sb, __func__,
+- "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+- }
++ "inode#%lu, eh->eh_entries = 0 and "
++ "EOFBLOCKS_FL set", inode->i_ino);
++ err = -EIO;
++ goto out2;
++ }
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err) {
--- /dev/null
+From 3b2905c2bc46795b9c8e54ddc435bd78f4391972 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:50:02 -0400
+Subject: ext4: Clear the EXT4_EOFBLOCKS_FL flag only when warranted
+
+commit 786ec7915e530936b9eb2e3d12274145cab7aa7d upstream (as of v2.6.34-git13)
+
+Dmitry Monakhov discovered an edge case where it was possible for the
+EXT4_EOFBLOCKS_FL flag to get cleared unnecessarily. This is true;
+I have a test case that can be exercised via downloading and
+decompressing the file:
+
+wget ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/ext4-testcases/eofblocks-fl-test-case.img.bz2
+bunzip2 eofblocks-fl-test-case.img.bz2
+dd if=/dev/zero of=eofblocks-fl-test-case.img bs=1k seek=17925 count=1 conv=notrunc
+
+However, triggering it in real life is highly unlikely since it
+requires an extremely fragmented sparse file with a hole in exactly
+the right place in the extent tree. (It actually took quite a bit of
+work to generate this test case.) Still, it's nice to get even
+extreme corner cases to be correct, so this patch makes sure that we
+don't clear the EXT4_EOFBLOCKS_FL incorrectly even in this corner
+case.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 26 ++++++++++++++++++--------
+ 1 file changed, 18 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3229,7 +3229,7 @@ int ext4_ext_get_blocks(handle_t *handle
+ struct ext4_extent_header *eh;
+ struct ext4_extent newex, *ex, *last_ex;
+ ext4_fsblk_t newblock;
+- int err = 0, depth, ret, cache_type;
++ int i, err = 0, depth, ret, cache_type;
+ unsigned int allocated = 0;
+ struct ext4_allocation_request ar;
+ ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+@@ -3410,19 +3410,29 @@ int ext4_ext_get_blocks(handle_t *handle
+ }
+
+ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+- if (eh->eh_entries) {
+- last_ex = EXT_LAST_EXTENT(eh);
+- if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+- + ext4_ext_get_actual_len(last_ex))
+- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+- } else {
+- WARN_ON(eh->eh_entries == 0);
++ if (unlikely(!eh->eh_entries)) {
+ ext4_error(inode->i_sb, __func__,
+ "inode#%lu, eh->eh_entries = 0 and "
+ "EOFBLOCKS_FL set", inode->i_ino);
+ err = -EIO;
+ goto out2;
+ }
++ last_ex = EXT_LAST_EXTENT(eh);
++ /*
++ * If the current leaf block was reached by looking at
++ * the last index block all the way down the tree, and
++ * we are extending the inode beyond the last extent
++ * in the current leaf block, then clear the
++ * EOFBLOCKS_FL flag.
++ */
++ for (i = depth-1; i >= 0; i--) {
++ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
++ break;
++ }
++ if ((i < 0) &&
++ (iblock + ar.len > le32_to_cpu(last_ex->ee_block) +
++ ext4_ext_get_actual_len(last_ex)))
++ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+ }
+ err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+ if (err) {
--- /dev/null
+From b3143b86111dcac45717136a6d776f993aace17f Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:50:03 -0400
+Subject: ext4: restart ext4_ext_remove_space() after transaction restart
+
+commit 0617b83fa239db9743a18ce6cc0e556f4d0fd567 upstream (as of v2.6.34-git13)
+
+If i_data_sem was internally dropped due to a transaction restart, it is
+necessary to restart the path look-up, because the extent tree may have
+been modified by ext4_get_block().
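+
+A minimal sketch of the restart idiom adopted here (illustration only: plain
+userspace C with made-up names such as restart_transaction and remove_space,
+not the kernel functions). A step that may drop the lock protecting a cached
+lookup reports -EAGAIN, and the caller throws the cached path away and redoes
+the look-up from the top.
+
+	#include <errno.h>
+	#include <stdio.h>
+
+	struct tree { int generation; };	/* bumped when the tree changes */
+
+	/* Pretend to restart a transaction: dropping and retaking the lock
+	 * lets others modify the tree, modelled as a generation bump. */
+	static int restart_transaction(struct tree *t, int *restarts)
+	{
+		if (*restarts > 0) {
+			(*restarts)--;
+			t->generation++;
+			return -EAGAIN;		/* cached path is now stale */
+		}
+		return 0;
+	}
+
+	static int remove_space(struct tree *t, int restarts)
+	{
+		int err, path_generation;
+	again:
+		/* rebuild the look-up from scratch, as the patch does */
+		path_generation = t->generation;
+
+		err = restart_transaction(t, &restarts);
+		if (err == -EAGAIN) {
+			printf("tree changed (gen %d -> %d), restarting\n",
+			       path_generation, t->generation);
+			goto again;
+		}
+		return err;
+	}
+
+	int main(void)
+	{
+		struct tree t = { .generation = 0 };
+		return remove_space(&t, 2) ? 1 : 0;
+	}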
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15827
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Acked-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c | 16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_rest
+ if (err <= 0)
+ return err;
+ err = ext4_truncate_restart_trans(handle, inode, needed);
+- /*
+- * We have dropped i_data_sem so someone might have cached again
+- * an extent we are going to truncate.
+- */
+- ext4_ext_invalidate_cache(inode);
++ if (err == 0)
++ err = -EAGAIN;
+
+ return err;
+ }
+@@ -2263,7 +2260,7 @@ static int ext4_ext_remove_space(struct
+ int depth = ext_depth(inode);
+ struct ext4_ext_path *path;
+ handle_t *handle;
+- int i = 0, err = 0;
++ int i, err;
+
+ ext_debug("truncate since %u\n", start);
+
+@@ -2272,23 +2269,26 @@ static int ext4_ext_remove_space(struct
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
++again:
+ ext4_ext_invalidate_cache(inode);
+
+ /*
+ * We start scanning from right side, freeing all the blocks
+ * after i_size and walking into the tree depth-wise.
+ */
++ depth = ext_depth(inode);
+ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
+ if (path == NULL) {
+ ext4_journal_stop(handle);
+ return -ENOMEM;
+ }
++ path[0].p_depth = depth;
+ path[0].p_hdr = ext_inode_hdr(inode);
+ if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+ err = -EIO;
+ goto out;
+ }
+- path[0].p_depth = depth;
++ i = err = 0;
+
+ while (i >= 0 && err == 0) {
+ if (i == depth) {
+@@ -2382,6 +2382,8 @@ static int ext4_ext_remove_space(struct
+ out:
+ ext4_ext_drop_refs(path);
+ kfree(path);
++ if (err == -EAGAIN)
++ goto again;
+ ext4_journal_stop(handle);
+
+ return err;
--- /dev/null
+From e58debc557cca3fa1ce0f893978be42dfa489699 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 30 May 2010 22:50:04 -0400
+Subject: ext4: Conditionally define compat ioctl numbers
+
+commit 899ad0cea6ad7ff4ba24b16318edbc3cbbe03fad upstream (as of v2.6.34-git13)
+
+It is unnecessary, and in general impossible, to define the compat
+ioctl numbers except when building the filesystem with CONFIG_COMPAT
+defined.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -462,6 +462,7 @@ struct ext4_new_group_data {
+ #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
+ #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
+
++#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+ /*
+ * ioctl commands in 32 bit emulation
+ */
+@@ -477,6 +478,7 @@ struct ext4_new_group_data {
+ #endif
+ #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
+ #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
++#endif
+
+
+ /*
--- /dev/null
+From a496748686cdccd4b5bf1b5696919e380dc48da0 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 30 May 2010 22:50:05 -0400
+Subject: ext4: Fix compat EXT4_IOC_ADD_GROUP
+
+commit 4d92dc0f00a775dc2e1267b0e00befb783902fe7 upstream (as of v2.6.34-git13)
+
+struct ext4_new_group_input needs to be converted because u64 has
+only 32-bit alignment on some 32-bit architectures, notably i386.
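+
+A minimal userspace sketch of why the conversion is needed (illustration
+only: native_input and compat_input below mirror the two layouts used in
+this patch, with compat_u64 approximated by a 4-byte-aligned typedef as on
+x86):
+
+	#include <stdio.h>
+	#include <stdint.h>
+	#include <stddef.h>
+
+	/* On a 64-bit build, uint64_t fields are 8-byte aligned ... */
+	struct native_input {		/* ~ struct ext4_new_group_input */
+		uint32_t group;
+		uint64_t block_bitmap;
+		uint64_t inode_bitmap;
+		uint64_t inode_table;
+		uint32_t blocks_count;
+		uint16_t reserved_blocks;
+		uint16_t unused;
+	};
+
+	/* ... while the i386 ABI places them on 4-byte boundaries, which is
+	 * what compat_u64 reproduces for the 32-bit layout. */
+	typedef uint64_t u64_aligned4 __attribute__((aligned(4)));
+
+	struct compat_input {		/* ~ struct compat_ext4_new_group_input */
+		uint32_t group;
+		u64_aligned4 block_bitmap;
+		u64_aligned4 inode_bitmap;
+		u64_aligned4 inode_table;
+		uint32_t blocks_count;
+		uint16_t reserved_blocks;
+		uint16_t unused;
+	};
+
+	int main(void)
+	{
+		/* On x86_64 this prints offsets 8 vs 4 and sizes 40 vs 36,
+		 * so the 32-bit layout cannot simply be cast to the native
+		 * struct and must be converted field by field. */
+		printf("native: offsetof(block_bitmap)=%zu sizeof=%zu\n",
+		       offsetof(struct native_input, block_bitmap),
+		       sizeof(struct native_input));
+		printf("compat: offsetof(block_bitmap)=%zu sizeof=%zu\n",
+		       offsetof(struct compat_input, block_bitmap),
+		       sizeof(struct compat_input));
+		return 0;
+	}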
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 16 ++++++++++++++++
+ fs/ext4/ioctl.c | 25 +++++++++++++++++++++++--
+ 2 files changed, 39 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -29,6 +29,9 @@
+ #include <linux/wait.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#ifdef __KERNEL__
++#include <linux/compat.h>
++#endif
+
+ /*
+ * The fourth extended filesystem constants/structures
+@@ -403,6 +406,18 @@ struct ext4_new_group_input {
+ __u16 unused;
+ };
+
++#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
++struct compat_ext4_new_group_input {
++ u32 group;
++ compat_u64 block_bitmap;
++ compat_u64 inode_bitmap;
++ compat_u64 inode_table;
++ u32 blocks_count;
++ u16 reserved_blocks;
++ u16 unused;
++};
++#endif
++
+ /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
+ struct ext4_new_group_data {
+ __u32 group;
+@@ -473,6 +488,7 @@ struct ext4_new_group_data {
+ #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
+ #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
+ #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
++#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
+ #ifdef CONFIG_JBD2_DEBUG
+ #define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
+ #endif
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -373,8 +373,29 @@ long ext4_compat_ioctl(struct file *file
+ case EXT4_IOC32_SETRSVSZ:
+ cmd = EXT4_IOC_SETRSVSZ;
+ break;
+- case EXT4_IOC_GROUP_ADD:
+- break;
++ case EXT4_IOC32_GROUP_ADD: {
++ struct compat_ext4_new_group_input __user *uinput;
++ struct ext4_new_group_input input;
++ mm_segment_t old_fs;
++ int err;
++
++ uinput = compat_ptr(arg);
++ err = get_user(input.group, &uinput->group);
++ err |= get_user(input.block_bitmap, &uinput->block_bitmap);
++ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
++ err |= get_user(input.inode_table, &uinput->inode_table);
++ err |= get_user(input.blocks_count, &uinput->blocks_count);
++ err |= get_user(input.reserved_blocks,
++ &uinput->reserved_blocks);
++ if (err)
++ return -EFAULT;
++ old_fs = get_fs();
++ set_fs(KERNEL_DS);
++ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
++ (unsigned long) &input);
++ set_fs(old_fs);
++ return err;
++ }
+ case EXT4_IOC_MOVE_EXT:
+ break;
+ default:
--- /dev/null
+From 2959737e6c8ee73e85bf706f11b272bab323597f Mon Sep 17 00:00:00 2001
+From: Frank Mayhar <fmayhar@google.com>
+Date: Sun, 30 May 2010 22:50:06 -0400
+Subject: ext4: Make fsync sync new parent directories in no-journal mode
+
+commit 14ece1028b3ed53ffec1b1213ffc6acaf79ad77c upstream (as of v2.6.34-git13)
+
+Add a new ext4 state to tell us when a file has been newly created; use
+that state in ext4_sync_file in no-journal mode to tell us when we need
+to sync the parent directory as well as the inode and data itself. This
+fixes a problem in which a panic or power failure may lose the entire
+file even when using fsync, since the parent directory entry is lost.
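+
+For comparison, a minimal userspace sketch of the pattern this change mirrors
+(illustration only; it assumes a directory named "dir" already exists):
+applications that cannot rely on the filesystem doing this have to fsync()
+the parent directory themselves to make the new name durable.
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int fd = open("dir/newfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);
+		if (fd < 0 || write(fd, "data\n", 5) != 5 || fsync(fd) < 0) {
+			perror("create/write/fsync file");
+			return 1;
+		}
+		close(fd);
+
+		/* Without this step a crash can leave the inode on disk but
+		 * the name missing from "dir" -- the failure mode addressed
+		 * here for plain fsync() on ext4 without a journal. */
+		int dirfd = open("dir", O_RDONLY | O_DIRECTORY);
+		if (dirfd < 0 || fsync(dirfd) < 0) {
+			perror("open/fsync dir");
+			return 1;
+		}
+		close(dirfd);
+		return 0;
+	}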
+
+Addresses-Google-Bug: #2480057
+
+Signed-off-by: Frank Mayhar <fmayhar@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h | 1 +
+ fs/ext4/fsync.c | 31 +++++++++++++++++++++++++++++--
+ fs/ext4/namei.c | 2 ++
+ 3 files changed, 32 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1142,6 +1142,7 @@ enum {
+ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
+ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
+ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
++ EXT4_STATE_NEWENTRY, /* File just added to dir */
+ };
+
+ #define EXT4_INODE_BIT_FNS(name, field) \
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -35,6 +35,29 @@
+ #include <trace/events/ext4.h>
+
+ /*
++ * If we're not journaling and this is a just-created file, we have to
++ * sync our parent directory (if it was freshly created) since
++ * otherwise it will only be written by writeback, leaving a huge
++ * window during which a crash may lose the file. This may apply for
++ * the parent directory's parent as well, and so on recursively, if
++ * they are also freshly created.
++ */
++static void ext4_sync_parent(struct inode *inode)
++{
++ struct dentry *dentry = NULL;
++
++ while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
++ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
++ dentry = list_entry(inode->i_dentry.next,
++ struct dentry, d_alias);
++ if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
++ break;
++ inode = dentry->d_parent->d_inode;
++ sync_mapping_buffers(inode->i_mapping);
++ }
++}
++
++/*
+ * akpm: A new design for ext4_sync_file().
+ *
+ * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+@@ -67,8 +90,12 @@ int ext4_sync_file(struct file *file, st
+ if (ret < 0)
+ return ret;
+
+- if (!journal)
+- return simple_fsync(file, dentry, datasync);
++ if (!journal) {
++ ret = simple_fsync(file, dentry, datasync);
++ if (!ret && !list_empty(&inode->i_dentry))
++ ext4_sync_parent(inode);
++ return ret;
++ }
+
+ /*
+ * data=writeback,ordered:
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1525,6 +1525,8 @@ static int ext4_add_entry(handle_t *hand
+ de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+ retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+ brelse(bh);
++ if (retval == 0)
++ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+ return retval;
+ }
+
usb-add-quirk-for-broadcom-bt-dongle.patch
usb-ftdi-add-support-for-the-rt-system-vx-7-radio-programming-cable.patch
ethtool-fix-potential-user-buffer-overflow-for-ethtool_-g-s-rxfh.patch
+0001-ext4-Fix-potential-quota-deadlock.patch
+0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch
+0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch
+0004-ext4-Eliminate-potential-double-free-on-error-path.patch
+0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch
+0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch
+0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch
+0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch
+0009-ext4-Calculate-metadata-requirements-more-accurately.patch
+0010-ext4-Handle-EDQUOT-error-on-write.patch
+0011-ext4-Fix-quota-accounting-error-with-fallocate.patch
+0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch
+0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch
+0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch
+0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch
+0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch
+0017-ext4-fix-error-handling-in-migrate.patch
+0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch
+0019-ext4-Handle-non-empty-on-disk-orphan-link.patch
+0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch
+0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch
+0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch
+0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch
+0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch
+0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch
+0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch
+0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch
+0028-ext4-Issue-the-discard-operation-before-releasing-th.patch
+0029-ext4-check-missed-return-value-in-ext4_sync_file.patch
+0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch
+0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch
+0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch
+0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch
+0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch
+0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch
+0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch
+0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch
+0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch
+0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch
+0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch
+0041-ext4-init-statistics-after-journal-recovery.patch
+0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch
+0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch
+0044-ext4-check-for-a-good-block-group-before-loading-bud.patch
+0045-ext4-Show-journal_checksum-option.patch
+0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch
+0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch
+0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch
+0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch
+0050-ext4-Conditionally-define-compat-ioctl-numbers.patch
+0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch
+0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch
+0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch
+0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch
+0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch
+0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch