From: Greg Kroah-Hartman Date: Wed, 28 Jul 2010 23:35:27 +0000 (-0700) Subject: .32 patches for ext4 and kvm X-Git-Tag: v2.6.27.49~18 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7d77745665297ff780257b7c6b53ac7b55a4dcc4;p=thirdparty%2Fkernel%2Fstable-queue.git .32 patches for ext4 and kvm --- diff --git a/queue-2.6.32/0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch b/queue-2.6.32/0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch new file mode 100644 index 00000000000..262239369ff --- /dev/null +++ b/queue-2.6.32/0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch @@ -0,0 +1,29 @@ +From 268a9f6207f354daedf0f92b0b57986bea37b69c Mon Sep 17 00:00:00 2001 +From: Avi Kivity +Date: Thu, 27 May 2010 14:35:58 +0300 +Subject: KVM: MMU: Remove user access when allowing kernel access to gpte.w=0 page + +If cr0.wp=0, we have to allow the guest kernel access to a page with pte.w=0. +We do that by setting spte.w=1, since the host cr0.wp must remain set so the +host can write protect pages. Once we allow write access, we must remove +user access otherwise we mistakenly allow the user to write the page. + +Reviewed-by: Xiao Guangrong +Signed-off-by: Avi Kivity +(cherry picked from commit 69325a122580d3a7b26589e8efdd6663001c3297) +--- + arch/x86/kvm/mmu.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -1843,6 +1843,9 @@ static int set_spte(struct kvm_vcpu *vcp + + spte |= PT_WRITABLE_MASK; + ++ if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) ++ spte &= ~PT_USER_MASK; ++ + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). 
Write protection diff --git a/queue-2.6.32/0001-ext4-Fix-potential-quota-deadlock.patch b/queue-2.6.32/0001-ext4-Fix-potential-quota-deadlock.patch new file mode 100644 index 00000000000..210732535fe --- /dev/null +++ b/queue-2.6.32/0001-ext4-Fix-potential-quota-deadlock.patch @@ -0,0 +1,242 @@ +From fea2aabf4ac586092b1a3acb4adb234bb4bf6266 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:14 -0400 +Subject: ext4: Fix potential quota deadlock + +commit d21cd8f163ac44b15c465aab7306db931c606908 upstream (as of v2.6.33-rc2) + +We have to delay vfs_dq_claim_space() until allocation context destruction. +Currently we have following call-trace: +ext4_mb_new_blocks() + /* task is already holding ac->alloc_semp */ + ->ext4_mb_mark_diskspace_used + ->vfs_dq_claim_space() /* acquire dqptr_sem here. Possible deadlock */ + ->ext4_mb_release_context() /* drop ac->alloc_semp here */ + +Let's move quota claiming to ext4_da_update_reserve_space() + + ======================================================= + [ INFO: possible circular locking dependency detected ] + 2.6.32-rc7 #18 + ------------------------------------------------------- + write-truncate-/3465 is trying to acquire lock: + (&s->s_dquot.dqptr_sem){++++..}, at: [] dquot_claim_space+0x3b/0x1b0 + + but task is already holding lock: + (&meta_group_info[i]->alloc_sem){++++..}, at: [] ext4_mb_load_buddy+0xb2/0x370 + + which lock already depends on the new lock. 
+ + the existing dependency chain (in reverse order) is: + + -> #3 (&meta_group_info[i]->alloc_sem){++++..}: + [] __lock_acquire+0xd7b/0x1260 + [] lock_acquire+0xba/0xd0 + [] down_read+0x51/0x90 + [] ext4_mb_load_buddy+0xb2/0x370 + [] ext4_mb_free_blocks+0x46c/0x870 + [] ext4_free_blocks+0x73/0x130 + [] ext4_ext_truncate+0x76c/0x8d0 + [] ext4_truncate+0x187/0x5e0 + [] vmtruncate+0x6b/0x70 + [] inode_setattr+0x62/0x190 + [] ext4_setattr+0x25a/0x370 + [] notify_change+0x151/0x340 + [] do_truncate+0x6d/0xa0 + [] may_open+0x1d4/0x200 + [] do_filp_open+0x1eb/0x910 + [] do_sys_open+0x6d/0x140 + [] sys_open+0x2e/0x40 + [] sysenter_do_call+0x12/0x32 + + -> #2 (&ei->i_data_sem){++++..}: + [] __lock_acquire+0xd7b/0x1260 + [] lock_acquire+0xba/0xd0 + [] down_read+0x51/0x90 + [] ext4_get_blocks+0x47/0x450 + [] ext4_getblk+0x61/0x1d0 + [] ext4_bread+0x1f/0xa0 + [] ext4_quota_write+0x12c/0x310 + [] qtree_write_dquot+0x93/0x120 + [] v2_write_dquot+0x28/0x30 + [] dquot_commit+0xab/0xf0 + [] ext4_write_dquot+0x77/0x90 + [] ext4_mark_dquot_dirty+0x2f/0x50 + [] dquot_alloc_inode+0x101/0x180 + [] ext4_new_inode+0x602/0xf00 + [] ext4_create+0x89/0x150 + [] vfs_create+0xa2/0xc0 + [] do_filp_open+0x7a7/0x910 + [] do_sys_open+0x6d/0x140 + [] sys_open+0x2e/0x40 + [] sysenter_do_call+0x12/0x32 + + -> #1 (&sb->s_type->i_mutex_key#7/4){+.+...}: + [] __lock_acquire+0xd7b/0x1260 + [] lock_acquire+0xba/0xd0 + [] mutex_lock_nested+0x65/0x2d0 + [] vfs_load_quota_inode+0x4bd/0x5a0 + [] vfs_quota_on_path+0x5f/0x70 + [] ext4_quota_on+0x112/0x190 + [] sys_quotactl+0x44a/0x8a0 + [] sysenter_do_call+0x12/0x32 + + -> #0 (&s->s_dquot.dqptr_sem){++++..}: + [] __lock_acquire+0x1091/0x1260 + [] lock_acquire+0xba/0xd0 + [] down_read+0x51/0x90 + [] dquot_claim_space+0x3b/0x1b0 + [] ext4_mb_mark_diskspace_used+0x36f/0x380 + [] ext4_mb_new_blocks+0x34a/0x530 + [] ext4_ext_get_blocks+0x122b/0x13c0 + [] ext4_get_blocks+0x226/0x450 + [] mpage_da_map_blocks+0xc3/0xaa0 + [] ext4_da_writepages+0x506/0x790 + [] 
do_writepages+0x22/0x50 + [] __filemap_fdatawrite_range+0x6d/0x80 + [] filemap_flush+0x2b/0x30 + [] ext4_alloc_da_blocks+0x5c/0x60 + [] ext4_release_file+0x75/0xb0 + [] __fput+0xf9/0x210 + [] fput+0x27/0x30 + [] filp_close+0x4c/0x80 + [] put_files_struct+0x6e/0xd0 + [] exit_files+0x47/0x60 + [] do_exit+0x144/0x710 + [] do_group_exit+0x38/0xa0 + [] get_signal_to_deliver+0x2ac/0x410 + [] do_notify_resume+0xb9/0x890 + [] work_notifysig+0x13/0x21 + + other info that might help us debug this: + + 3 locks held by write-truncate-/3465: + #0: (jbd2_handle){+.+...}, at: [] start_this_handle+0x38f/0x5c0 + #1: (&ei->i_data_sem){++++..}, at: [] ext4_get_blocks+0xb6/0x450 + #2: (&meta_group_info[i]->alloc_sem){++++..}, at: [] ext4_mb_load_buddy+0xb2/0x370 + + stack backtrace: + Pid: 3465, comm: write-truncate- Not tainted 2.6.32-rc7 #18 + Call Trace: + [] ? printk+0x1d/0x22 + [] print_circular_bug+0xca/0xd0 + [] __lock_acquire+0x1091/0x1260 + [] ? sched_clock_local+0xd2/0x170 + [] ? trace_hardirqs_off_caller+0x20/0xd0 + [] lock_acquire+0xba/0xd0 + [] ? dquot_claim_space+0x3b/0x1b0 + [] down_read+0x51/0x90 + [] ? dquot_claim_space+0x3b/0x1b0 + [] dquot_claim_space+0x3b/0x1b0 + [] ext4_mb_mark_diskspace_used+0x36f/0x380 + [] ext4_mb_new_blocks+0x34a/0x530 + [] ? ext4_ext_find_extent+0x25d/0x280 + [] ext4_ext_get_blocks+0x122b/0x13c0 + [] ? sched_clock_local+0xd2/0x170 + [] ? sched_clock_cpu+0x120/0x160 + [] ? cpu_clock+0x4f/0x60 + [] ? trace_hardirqs_off_caller+0x20/0xd0 + [] ? down_write+0x8c/0xa0 + [] ext4_get_blocks+0x226/0x450 + [] ? sched_clock_cpu+0x120/0x160 + [] ? cpu_clock+0x4f/0x60 + [] ? trace_hardirqs_off+0xb/0x10 + [] mpage_da_map_blocks+0xc3/0xaa0 + [] ? find_get_pages_tag+0x16c/0x180 + [] ? find_get_pages_tag+0x0/0x180 + [] ? __mpage_da_writepage+0x16d/0x1a0 + [] ? pagevec_lookup_tag+0x2e/0x40 + [] ? write_cache_pages+0xdb/0x3d0 + [] ? __mpage_da_writepage+0x0/0x1a0 + [] ext4_da_writepages+0x506/0x790 + [] ? cpu_clock+0x4f/0x60 + [] ? 
sched_clock_local+0xd2/0x170 + [] ? sched_clock_cpu+0x120/0x160 + [] ? sched_clock_cpu+0x120/0x160 + [] ? ext4_da_writepages+0x0/0x790 + [] do_writepages+0x22/0x50 + [] __filemap_fdatawrite_range+0x6d/0x80 + [] filemap_flush+0x2b/0x30 + [] ext4_alloc_da_blocks+0x5c/0x60 + [] ext4_release_file+0x75/0xb0 + [] __fput+0xf9/0x210 + [] fput+0x27/0x30 + [] filp_close+0x4c/0x80 + [] put_files_struct+0x6e/0xd0 + [] exit_files+0x47/0x60 + [] do_exit+0x144/0x710 + [] ? lock_release_holdtime+0x33/0x210 + [] ? _spin_unlock_irq+0x27/0x30 + [] do_group_exit+0x38/0xa0 + [] ? trace_hardirqs_on+0xb/0x10 + [] get_signal_to_deliver+0x2ac/0x410 + [] do_notify_resume+0xb9/0x890 + [] ? trace_hardirqs_off_caller+0x20/0xd0 + [] ? lock_release_holdtime+0x33/0x210 + [] ? autoremove_wake_function+0x0/0x50 + [] ? trace_hardirqs_on_caller+0x134/0x190 + [] ? trace_hardirqs_on+0xb/0x10 + [] ? security_file_permission+0x14/0x20 + [] ? vfs_write+0x131/0x190 + [] ? do_sync_write+0x0/0x120 + [] ? sysenter_do_call+0x27/0x32 + [] work_notifysig+0x13/0x21 + +CC: Theodore Ts'o +Signed-off-by: Dmitry Monakhov +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 9 +++++++-- + fs/ext4/mballoc.c | 6 ------ + 2 files changed, 7 insertions(+), 8 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1088,7 +1088,7 @@ static int ext4_calc_metadata_amount(str + static void ext4_da_update_reserve_space(struct inode *inode, int used) + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- int total, mdb, mdb_free; ++ int total, mdb, mdb_free, mdb_claim = 0; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ +@@ -1101,7 +1101,9 @@ static void ext4_da_update_reserve_space + + if (mdb_free) { + /* Account for allocated meta_blocks */ +- mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; ++ mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks; ++ BUG_ON(mdb_free < mdb_claim); ++ mdb_free -= 
mdb_claim; + + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); +@@ -1112,8 +1114,11 @@ static void ext4_da_update_reserve_space + /* update per-inode reservations */ + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= used; ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + ++ vfs_dq_claim_block(inode, used + mdb_claim); ++ + /* + * free those over-booking quota for metadata blocks + */ +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_ + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); +- else { +- percpu_counter_sub(&sbi->s_dirtyblocks_counter, +- ac->ac_b_ex.fe_len); +- /* convert reserved quota blocks to real quota blocks */ +- vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len); +- } + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, diff --git a/queue-2.6.32/0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch b/queue-2.6.32/0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch new file mode 100644 index 00000000000..e051f864cb6 --- /dev/null +++ b/queue-2.6.32/0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch @@ -0,0 +1,56 @@ +From 9ce5c64e94beb615d6581e7b8839bb0173903425 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Mon, 17 May 2010 14:43:34 +0200 +Subject: KVM: SVM: Handle MCEs early in the vmexit process + +This patch moves handling of the MC vmexits to an earlier +point in the vmexit. The handle_exit function is too late +because the vcpu might alreadry have changed its physical +cpu. 
+ +Cc: stable@kernel.org +Signed-off-by: Joerg Roedel +Signed-off-by: Avi Kivity +(cherry picked from commit fe5913e4e1700cbfc337f4b1da9ddb26f6a55586) +--- + arch/x86/kvm/svm.c | 15 ++++++++++++++- + 1 file changed, 14 insertions(+), 1 deletion(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -1257,7 +1257,7 @@ static int nm_interception(struct vcpu_s + return 1; + } + +-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) ++static void svm_handle_mce(struct vcpu_svm *svm) + { + /* + * On an #MC intercept the MCE handler is not called automatically in +@@ -1267,6 +1267,11 @@ static int mc_interception(struct vcpu_s + "int $0x12\n"); + /* not sure if we ever come back to this point */ + ++ return; ++} ++ ++static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) ++{ + return 1; + } + +@@ -2717,6 +2722,14 @@ static void svm_vcpu_run(struct kvm_vcpu + vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); + vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); + } ++ ++ /* ++ * We need to handle MC intercepts here before the vcpu has a chance to ++ * change the physical cpu ++ */ ++ if (unlikely(svm->vmcb->control.exit_code == ++ SVM_EXIT_EXCP_BASE + MC_VECTOR)) ++ svm_handle_mce(svm); + } + + #undef R diff --git a/queue-2.6.32/0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch b/queue-2.6.32/0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch new file mode 100644 index 00000000000..85c7e28eca9 --- /dev/null +++ b/queue-2.6.32/0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch @@ -0,0 +1,41 @@ +From f57e36578513418a67eef4912c8503a47a4993aa Mon Sep 17 00:00:00 2001 +From: Surbhi Palande +Date: Sun, 30 May 2010 22:49:16 -0400 +Subject: ext4: replace BUG() with return -EIO in ext4_ext_get_blocks + +commit 034fb4c95fc0fed4ec4a50778127b92c6f2aec01 upstream (as of v2.6.33-rc3) + +This patch fixes the Kernel BZ #14286. 
When the address of an extent +corresponding to a valid block is corrupted, a -EIO should be reported +instead of a BUG(). This situation should normally not occur +except in the case of a corrupted filesystem. If however it does, +then the system should not panic directly but depending on the mount +time options appropriate action should be taken. If the mount options +so permit, the I/O should be gracefully aborted by returning a -EIO. + +http://bugzilla.kernel.org/show_bug.cgi?id=14286 + +Signed-off-by: Surbhi Palande +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3196,7 +3196,13 @@ int ext4_ext_get_blocks(handle_t *handle + * this situation is possible, though, _during_ tree modification; + * this is why assert can't be put in ext4_ext_find_extent() + */ +- BUG_ON(path[depth].p_ext == NULL && depth != 0); ++ if (path[depth].p_ext == NULL && depth != 0) { ++ ext4_error(inode->i_sb, __func__, "bad extent address " ++ "inode: %lu, iblock: %d, depth: %d", ++ inode->i_ino, iblock, depth); ++ err = -EIO; ++ goto out2; ++ } + eh = path[depth].p_hdr; + + ex = path[depth].p_ext; diff --git a/queue-2.6.32/0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch b/queue-2.6.32/0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch new file mode 100644 index 00000000000..5d8d39db8b1 --- /dev/null +++ b/queue-2.6.32/0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch @@ -0,0 +1,165 @@ +From a61279422bc32ecbf85e3a6a9349287c7df0b0b1 Mon Sep 17 00:00:00 2001 +From: Joerg Roedel +Date: Mon, 17 May 2010 14:43:35 +0200 +Subject: KVM: SVM: Implement workaround for Erratum 383 + +This patch implements a workaround for AMD erratum 383 into +KVM. Without this erratum fix it is possible for a guest to +kill the host machine.
This patch implements the suggested +workaround for hypervisors which will be published by the +next revision guide update. + +[jan: fix overflow warning on i386] +[xiao: fix unused variable warning] + +Cc: stable@kernel.org +Signed-off-by: Joerg Roedel +Signed-off-by: Jan Kiszka +Signed-off-by: Xiao Guangrong +Signed-off-by: Avi Kivity +(cherry picked from commit 67ec66077799f2fef84b21a643912b179c422281) +--- + arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/svm.c | 84 ++++++++++++++++++++++++++++++++++++++- + 2 files changed, 84 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -106,6 +106,7 @@ + #define MSR_AMD64_PATCH_LOADER 0xc0010020 + #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 + #define MSR_AMD64_OSVW_STATUS 0xc0010141 ++#define MSR_AMD64_DC_CFG 0xc0011022 + #define MSR_AMD64_IBSFETCHCTL 0xc0011030 + #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 + #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -27,6 +27,7 @@ + #include + #include + ++#include + #include + + #include +@@ -62,6 +63,8 @@ MODULE_LICENSE("GPL"); + #define nsvm_printk(fmt, args...) 
do {} while(0) + #endif + ++static bool erratum_383_found __read_mostly; ++ + static const u32 host_save_user_msrs[] = { + #ifdef CONFIG_X86_64 + MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, +@@ -299,6 +302,31 @@ static void skip_emulated_instruction(st + svm_set_interrupt_shadow(vcpu, 0); + } + ++static void svm_init_erratum_383(void) ++{ ++ u32 low, high; ++ int err; ++ u64 val; ++ ++ /* Only Fam10h is affected */ ++ if (boot_cpu_data.x86 != 0x10) ++ return; ++ ++ /* Use _safe variants to not break nested virtualization */ ++ val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); ++ if (err) ++ return; ++ ++ val |= (1ULL << 47); ++ ++ low = lower_32_bits(val); ++ high = upper_32_bits(val); ++ ++ native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); ++ ++ erratum_383_found = true; ++} ++ + static int has_svm(void) + { + const char *msg; +@@ -318,7 +346,6 @@ static void svm_hardware_disable(void *g + + static void svm_hardware_enable(void *garbage) + { +- + struct svm_cpu_data *svm_data; + uint64_t efer; + struct descriptor_table gdt_descr; +@@ -350,6 +377,10 @@ static void svm_hardware_enable(void *ga + + wrmsrl(MSR_VM_HSAVE_PA, + page_to_pfn(svm_data->save_area) << PAGE_SHIFT); ++ ++ svm_init_erratum_383(); ++ ++ return; + } + + static void svm_cpu_uninit(int cpu) +@@ -1257,8 +1288,59 @@ static int nm_interception(struct vcpu_s + return 1; + } + ++static bool is_erratum_383(void) ++{ ++ int err, i; ++ u64 value; ++ ++ if (!erratum_383_found) ++ return false; ++ ++ value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); ++ if (err) ++ return false; ++ ++ /* Bit 62 may or may not be set for this mce */ ++ value &= ~(1ULL << 62); ++ ++ if (value != 0xb600000000010015ULL) ++ return false; ++ ++ /* Clear MCi_STATUS registers */ ++ for (i = 0; i < 6; ++i) ++ native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); ++ ++ value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); ++ if (!err) { ++ u32 low, high; ++ ++ value &= ~(1ULL << 2); ++ low = 
lower_32_bits(value); ++ high = upper_32_bits(value); ++ ++ native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); ++ } ++ ++ /* Flush tlb to evict multi-match entries */ ++ __flush_tlb_all(); ++ ++ return true; ++} ++ + static void svm_handle_mce(struct vcpu_svm *svm) + { ++ if (is_erratum_383()) { ++ /* ++ * Erratum 383 triggered. Guest state is corrupt so kill the ++ * guest. ++ */ ++ pr_err("KVM: Guest triggered AMD Erratum 383\n"); ++ ++ set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); ++ ++ return; ++ } ++ + /* + * On an #MC intercept the MCE handler is not called automatically in + * the host. So do it by hand here. diff --git a/queue-2.6.32/0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch b/queue-2.6.32/0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch new file mode 100644 index 00000000000..75bbb45519a --- /dev/null +++ b/queue-2.6.32/0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch @@ -0,0 +1,141 @@ +From de6e76774ecec8a14ef63d3ad383479ca98633e6 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:49:17 -0400 +Subject: ext4, jbd2: Add barriers for file systems with exernal journals + +commit cc3e1bea5d87635c519da657303690f5538bb4eb upstream (as of v2.6.33-rc3) + +This is a bit complicated because we are trying to optimize when we +send barriers to the fs data disk. We could just throw in an extra +barrier to the data disk whenever we send a barrier to the journal +disk, but that's not always strictly necessary. + +We only need to send a barrier during a commit when there are data +blocks which are must be written out due to an inode written in +ordered mode, or if fsync() depends on the commit to force data blocks +to disk. Finally, before we drop transactions from the beginning of +the journal during a checkpoint operation, we need to guarantee that +any blocks that were flushed out to the data disk are firmly on the +rust platter before we drop the transaction from the journal. 
+ +Thanks to Oleg Drokin for pointing out this flaw in ext3/ext4. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/fsync.c | 16 ++++++++++++++-- + fs/jbd2/checkpoint.c | 15 +++++++++++++++ + fs/jbd2/commit.c | 19 +++++++++++-------- + include/linux/jbd2.h | 1 + + 4 files changed, 41 insertions(+), 10 deletions(-) + +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, st + return ext4_force_commit(inode->i_sb); + + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; +- if (jbd2_log_start_commit(journal, commit_tid)) ++ if (jbd2_log_start_commit(journal, commit_tid)) { ++ /* ++ * When the journal is on a different device than the ++ * fs data disk, we need to issue the barrier in ++ * writeback mode. (In ordered mode, the jbd2 layer ++ * will take care of issuing the barrier. In ++ * data=journal, all of the data blocks are written to ++ * the journal device.) ++ */ ++ if (ext4_should_writeback_data(inode) && ++ (journal->j_fs_dev != journal->j_dev) && ++ (journal->j_flags & JBD2_BARRIER)) ++ blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + jbd2_log_wait_commit(journal, commit_tid); +- else if (journal->j_flags & JBD2_BARRIER) ++ } else if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + return ret; + } +--- a/fs/jbd2/checkpoint.c ++++ b/fs/jbd2/checkpoint.c +@@ -22,6 +22,7 @@ + #include + #include + #include ++#include + #include + + /* +@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + spin_unlock(&journal->j_state_lock); ++ ++ /* ++ * If there is an external journal, we need to make sure that ++ * any data blocks that were recently written out --- perhaps ++ * by jbd2_log_do_checkpoint() --- are flushed out before we ++ * drop the transactions from the external journal. 
It's ++ * unlikely this will be necessary, especially with a ++ * appropriately sized journal, but we need this to guarantee ++ * correctness. Fortunately jbd2_cleanup_journal_tail() ++ * doesn't get called all that often. ++ */ ++ if ((journal->j_fs_dev != journal->j_dev) && ++ (journal->j_flags & JBD2_BARRIER)) ++ blkdev_issue_flush(journal->j_fs_dev, NULL); + if (!(journal->j_flags & JBD2_ABORT)) + jbd2_journal_update_superblock(journal, 1); + return 0; +--- a/fs/jbd2/commit.c ++++ b/fs/jbd2/commit.c +@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(j + ret = err; + spin_lock(&journal->j_list_lock); + J_ASSERT(jinode->i_transaction == commit_transaction); ++ commit_transaction->t_flushed_data_blocks = 1; + jinode->i_flags &= ~JI_COMMIT_RUNNING; + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); + } +@@ -708,8 +709,17 @@ start_journal_io: + } + } + +- /* Done it all: now write the commit record asynchronously. */ ++ /* ++ * If the journal is not located on the file system device, ++ * then we must flush the file system device before we issue ++ * the commit record ++ */ ++ if (commit_transaction->t_flushed_data_blocks && ++ (journal->j_fs_dev != journal->j_dev) && ++ (journal->j_flags & JBD2_BARRIER)) ++ blkdev_issue_flush(journal->j_fs_dev, NULL); + ++ /* Done it all: now write the commit record asynchronously. */ + if (JBD2_HAS_INCOMPAT_FEATURE(journal, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { + err = journal_submit_commit_record(journal, commit_transaction, +@@ -720,13 +730,6 @@ start_journal_io: + blkdev_issue_flush(journal->j_dev, NULL); + } + +- /* +- * This is the right place to wait for data buffers both for ASYNC +- * and !ASYNC commit. If commit is ASYNC, we need to wait only after +- * the commit block went to disk (which happens above). If commit is +- * SYNC, we need to wait for data buffers before we start writing +- * commit block, which happens below in such setting. 
+- */ + err = journal_finish_inode_data_buffers(journal, commit_transaction); + if (err) { + printk(KERN_WARNING +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -653,6 +653,7 @@ struct transaction_s + * waiting for it to finish. + */ + unsigned int t_synchronous_commit:1; ++ unsigned int t_flushed_data_blocks:1; + + /* + * For use by the filesystem to store fs-specific data diff --git a/queue-2.6.32/0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch b/queue-2.6.32/0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch new file mode 100644 index 00000000000..e933f4bd00a --- /dev/null +++ b/queue-2.6.32/0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch @@ -0,0 +1,34 @@ +From 51e00c5c8ddedce8030521bf8645d90b82854980 Mon Sep 17 00:00:00 2001 +From: Marcelo Tosatti +Date: Fri, 28 May 2010 09:44:59 -0300 +Subject: KVM: MMU: invalidate and flush on spte small->large page size change + +Always invalidate spte and flush TLBs when changing page size, to make +sure different sized translations for the same address are never cached +in a CPU's TLB. + +Currently the only case where this occurs is when a non-leaf spte pointer is +overwritten by a leaf, large spte entry. This can happen after dirty +logging is disabled on a memslot, for example. + +Noticed by Andrea. 
+ +KVM-Stable-Tag +Signed-off-by: Marcelo Tosatti +Signed-off-by: Avi Kivity +(cherry picked from commit 3be2264be3c00865116f997dc53ebcc90fe7fc4b) +--- + arch/x86/kvm/mmu.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/arch/x86/kvm/mmu.c ++++ b/arch/x86/kvm/mmu.c +@@ -1901,6 +1901,8 @@ static void mmu_set_spte(struct kvm_vcpu + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, sptep); ++ __set_spte(sptep, shadow_trap_nonpresent_pte); ++ kvm_flush_remote_tlbs(vcpu->kvm); + } else if (pfn != spte_to_pfn(*sptep)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*sptep), pfn); diff --git a/queue-2.6.32/0004-ext4-Eliminate-potential-double-free-on-error-path.patch b/queue-2.6.32/0004-ext4-Eliminate-potential-double-free-on-error-path.patch new file mode 100644 index 00000000000..c590c7a4128 --- /dev/null +++ b/queue-2.6.32/0004-ext4-Eliminate-potential-double-free-on-error-path.patch @@ -0,0 +1,55 @@ +From 857855f2523af677951cb3bba61396813df6128d Mon Sep 17 00:00:00 2001 +From: Julia Lawall +Date: Sun, 30 May 2010 22:49:18 -0400 +Subject: ext4: Eliminate potential double free on error path + +commit d3533d72e7478a61a3e1936956fc825289a2acf4 upstream (as of v2.6.33-rc3) + +b_entry_name and buffer are initially NULL, are initialized within a loop +to the result of calling kmalloc, and are freed at the bottom of this loop. +The loop contains gotos to cleanup, which also frees b_entry_name and +buffer. Some of these gotos are before the reinitializations of +b_entry_name and buffer. To maintain the invariant that b_entry_name and +buffer are NULL at the top of the loop, and thus acceptable arguments to +kfree, these variables are now set to NULL after the kfrees. + +This seems to be the simplest solution. A more complicated solution +would be to introduce more labels in the error handling code at the end of +the function. 
+ +A simplified version of the semantic match that finds this problem is as +follows: (http://coccinelle.lip6.fr/) + +// +@r@ +identifier E; +expression E1; +iterator I; +statement S; +@@ + +*kfree(E); +... when != E = E1 + when != I(E,...) S + when != &E +*kfree(E); +// + +Signed-off-by: Julia Lawall +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/xattr.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1327,6 +1327,8 @@ retry: + goto cleanup; + kfree(b_entry_name); + kfree(buffer); ++ b_entry_name = NULL; ++ buffer = NULL; + brelse(is->iloc.bh); + kfree(is); + kfree(bs); diff --git a/queue-2.6.32/0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch b/queue-2.6.32/0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch new file mode 100644 index 00000000000..1de448b52af --- /dev/null +++ b/queue-2.6.32/0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch @@ -0,0 +1,33 @@ +From 657eba6d2e9501946a11cc4f53148e46e4b3cbe1 Mon Sep 17 00:00:00 2001 +From: Richard Kennedy +Date: Sun, 30 May 2010 22:49:19 -0400 +Subject: ext4: return correct wbc.nr_to_write in ext4_da_writepages + +commit 2faf2e19dd0e060eeb32442858ef495ac3083277 upstream (as of v2.6.33-rc3) + +When ext4_da_writepages increases the nr_to_write in writeback_control +then it must always re-base the return value. Originally there was a +(misguided) attempt prevent wbc.nr_to_write from going negative. In +fact, it's necessary to allow nr_to_write to be negative so that +wb_writeback() can correctly calculate how many pages were actually +written. 
+ +Signed-off-by: Richard Kennedy +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3010,8 +3010,7 @@ retry: + out_writepages: + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; +- if (wbc->nr_to_write > nr_to_writebump) +- wbc->nr_to_write -= nr_to_writebump; ++ wbc->nr_to_write -= nr_to_writebump; + wbc->range_start = range_start; + trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); + return ret; diff --git a/queue-2.6.32/0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch b/queue-2.6.32/0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch new file mode 100644 index 00000000000..f42e378d8e3 --- /dev/null +++ b/queue-2.6.32/0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch @@ -0,0 +1,55 @@ +From 436e2704a8b589fb1217add4f9e5be480773ca6c Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Sun, 30 May 2010 22:49:20 -0400 +Subject: ext4: Ensure zeroout blocks have no dirty metadata + +commit 515f41c33a9d44a964264c9511ad2c869af1fac3 upstream (as of v2.6.33-rc3) + +This fixes a bug (found by Curt Wohlgemuth) in which new blocks +returned from an extent created with ext4_ext_zeroout() can have dirty +metadata still associated with them. 
+ +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3029,6 +3029,14 @@ out: + return err; + } + ++static void unmap_underlying_metadata_blocks(struct block_device *bdev, ++ sector_t block, int count) ++{ ++ int i; ++ for (i = 0; i < count; i++) ++ unmap_underlying_metadata(bdev, block + i); ++} ++ + static int + ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int max_blocks, +@@ -3104,6 +3112,18 @@ out: + } else + allocated = ret; + set_buffer_new(bh_result); ++ /* ++ * if we allocated more blocks than requested ++ * we need to make sure we unmap the extra block ++ * allocated. The actual needed block will get ++ * unmapped later when we find the buffer_head marked ++ * new. ++ */ ++ if (allocated > max_blocks) { ++ unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, ++ newblock + max_blocks, ++ allocated - max_blocks); ++ } + map_out: + set_buffer_mapped(bh_result); + out1: diff --git a/queue-2.6.32/0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch b/queue-2.6.32/0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch new file mode 100644 index 00000000000..2c83789f465 --- /dev/null +++ b/queue-2.6.32/0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch @@ -0,0 +1,266 @@ +From 74ded2cc0427839ccdda41f2738130f0eea77fde Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:49:21 -0400 +Subject: ext4: Patch up how we claim metadata blocks for quota purposes + +commit 0637c6f4135f592f094207c7c21e7c0fc5557834 upstream (as of v2.6.33-rc3) + +As reported in Kernel Bugzilla #14936, commit d21cd8f triggered a BUG +in the function ext4_da_update_reserve_space() found in +fs/ext4/inode.c. 
The root cause of this BUG() was caused by the fact +that ext4_calc_metadata_amount() can severely over-estimate how many +metadata blocks will be needed, especially when using direct +block-mapped files. + +In addition, it can also badly *under* estimate how much space is +needed, since ext4_calc_metadata_amount() assumes that the blocks are +contiguous, and this is not always true. If the application is +writing blocks to a sparse file, the number of metadata blocks +necessary can be severely underestimated by the functions +ext4_da_reserve_space(), ext4_da_update_reserve_space() and +ext4_da_release_space(). This was the cause of the dq_claim_space +reports found on kerneloops.org. + +Unfortunately, doing this right means that we need to massively +over-estimate the amount of free space needed. So in some cases we +may need to force the inode to be written to disk asynchronously in +order to avoid spurious quota failures. + +http://bugzilla.kernel.org/show_bug.cgi?id=14936 + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 153 ++++++++++++++++++++++++++++++-------------------------- + 1 file changed, 82 insertions(+), 71 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1085,43 +1085,47 @@ static int ext4_calc_metadata_amount(str + return ext4_indirect_calc_metadata_amount(inode, blocks); + } + ++/* ++ * Called with i_data_sem down, which is important since we can call ++ * ext4_discard_preallocations() from here.
++ */ + static void ext4_da_update_reserve_space(struct inode *inode, int used) + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- int total, mdb, mdb_free, mdb_claim = 0; ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ int mdb_free = 0; + +- spin_lock(&EXT4_I(inode)->i_block_reservation_lock); +- /* recalculate the number of metablocks still need to be reserved */ +- total = EXT4_I(inode)->i_reserved_data_blocks - used; +- mdb = ext4_calc_metadata_amount(inode, total); +- +- /* figure out how many metablocks to release */ +- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); +- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; +- +- if (mdb_free) { +- /* Account for allocated meta_blocks */ +- mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks; +- BUG_ON(mdb_free < mdb_claim); +- mdb_free -= mdb_claim; ++ spin_lock(&ei->i_block_reservation_lock); ++ if (unlikely(used > ei->i_reserved_data_blocks)) { ++ ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " ++ "with only %d reserved data blocks\n", ++ __func__, inode->i_ino, used, ++ ei->i_reserved_data_blocks); ++ WARN_ON(1); ++ used = ei->i_reserved_data_blocks; ++ } ++ ++ /* Update per-inode reservations */ ++ ei->i_reserved_data_blocks -= used; ++ used += ei->i_allocated_meta_blocks; ++ ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; ++ ei->i_allocated_meta_blocks = 0; ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); + +- /* update fs dirty blocks counter */ ++ if (ei->i_reserved_data_blocks == 0) { ++ /* ++ * We can release all of the reserved metadata blocks ++ * only when we have written all of the delayed ++ * allocation blocks. 
++ */ ++ mdb_free = ei->i_allocated_meta_blocks; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); +- EXT4_I(inode)->i_allocated_meta_blocks = 0; +- EXT4_I(inode)->i_reserved_meta_blocks = mdb; ++ ei->i_allocated_meta_blocks = 0; + } +- +- /* update per-inode reservations */ +- BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); +- EXT4_I(inode)->i_reserved_data_blocks -= used; +- percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + +- vfs_dq_claim_block(inode, used + mdb_claim); +- +- /* +- * free those over-booking quota for metadata blocks +- */ ++ /* Update quota subsystem */ ++ vfs_dq_claim_block(inode, used); + if (mdb_free) + vfs_dq_release_reservation_block(inode, mdb_free); + +@@ -1130,7 +1134,8 @@ static void ext4_da_update_reserve_space + * there aren't any writers on the inode, we can discard the + * inode's preallocations. + */ +- if (!total && (atomic_read(&inode->i_writecount) == 0)) ++ if ((ei->i_reserved_data_blocks == 0) && ++ (atomic_read(&inode->i_writecount) == 0)) + ext4_discard_preallocations(inode); + } + +@@ -1843,7 +1848,8 @@ static int ext4_da_reserve_space(struct + { + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- unsigned long md_needed, mdblocks, total = 0; ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ unsigned long md_needed, md_reserved, total = 0; + + /* + * recalculate the amount of metadata blocks to reserve +@@ -1851,35 +1857,44 @@ static int ext4_da_reserve_space(struct + * worse case is one extent per block + */ + repeat: +- spin_lock(&EXT4_I(inode)->i_block_reservation_lock); +- total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; +- mdblocks = ext4_calc_metadata_amount(inode, total); +- BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); +- +- md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; ++ spin_lock(&ei->i_block_reservation_lock); ++ md_reserved = ei->i_reserved_meta_blocks; ++ 
md_needed = ext4_calc_metadata_amount(inode, nrblocks); + total = md_needed + nrblocks; +- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); ++ spin_unlock(&ei->i_block_reservation_lock); + + /* + * Make quota reservation here to prevent quota overflow + * later. Real quota accounting is done at pages writeout + * time. + */ +- if (vfs_dq_reserve_block(inode, total)) ++ if (vfs_dq_reserve_block(inode, total)) { ++ /* ++ * We tend to badly over-estimate the amount of ++ * metadata blocks which are needed, so if we have ++ * reserved any metadata blocks, try to force out the ++ * inode and see if we have any better luck. ++ */ ++ if (md_reserved && retries++ <= 3) ++ goto retry; + return -EDQUOT; ++ } + + if (ext4_claim_free_blocks(sbi, total)) { + vfs_dq_release_reservation_block(inode, total); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { ++ retry: ++ if (md_reserved) ++ write_inode_now(inode, (retries == 3)); + yield(); + goto repeat; + } + return -ENOSPC; + } +- spin_lock(&EXT4_I(inode)->i_block_reservation_lock); +- EXT4_I(inode)->i_reserved_data_blocks += nrblocks; +- EXT4_I(inode)->i_reserved_meta_blocks += md_needed; +- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); ++ spin_lock(&ei->i_block_reservation_lock); ++ ei->i_reserved_data_blocks += nrblocks; ++ ei->i_reserved_meta_blocks += md_needed; ++ spin_unlock(&ei->i_block_reservation_lock); + + return 0; /* success */ + } +@@ -1887,49 +1902,45 @@ repeat: + static void ext4_da_release_space(struct inode *inode, int to_free) + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); +- int total, mdb, mdb_free, release; ++ struct ext4_inode_info *ei = EXT4_I(inode); + + if (!to_free) + return; /* Nothing to release, exit */ + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + +- if (!EXT4_I(inode)->i_reserved_data_blocks) { ++ if (unlikely(to_free > ei->i_reserved_data_blocks)) { + /* +- * if there is no reserved blocks, but we try to free some +- * then the counter is messed up 
somewhere. +- * but since this function is called from invalidate +- * page, it's harmless to return without any action ++ * if there aren't enough reserved blocks, then the ++ * counter is messed up somewhere. Since this ++ * function is called from invalidate page, it's ++ * harmless to return without any action. + */ +- printk(KERN_INFO "ext4 delalloc try to release %d reserved " +- "blocks for inode %lu, but there is no reserved " +- "data blocks\n", to_free, inode->i_ino); +- spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +- return; ++ ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " ++ "ino %lu, to_free %d with only %d reserved " ++ "data blocks\n", inode->i_ino, to_free, ++ ei->i_reserved_data_blocks); ++ WARN_ON(1); ++ to_free = ei->i_reserved_data_blocks; + } ++ ei->i_reserved_data_blocks -= to_free; + +- /* recalculate the number of metablocks still need to be reserved */ +- total = EXT4_I(inode)->i_reserved_data_blocks - to_free; +- mdb = ext4_calc_metadata_amount(inode, total); +- +- /* figure out how many metablocks to release */ +- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); +- mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; +- +- release = to_free + mdb_free; +- +- /* update fs dirty blocks counter for truncate case */ +- percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); ++ if (ei->i_reserved_data_blocks == 0) { ++ /* ++ * We can release all of the reserved metadata blocks ++ * only when we have written all of the delayed ++ * allocation blocks. 
++ */ ++ to_free += ei->i_allocated_meta_blocks; ++ ei->i_allocated_meta_blocks = 0; ++ } + +- /* update per-inode reservations */ +- BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); +- EXT4_I(inode)->i_reserved_data_blocks -= to_free; ++ /* update fs dirty blocks counter */ ++ percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); + +- BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); +- EXT4_I(inode)->i_reserved_meta_blocks = mdb; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + +- vfs_dq_release_reservation_block(inode, release); ++ vfs_dq_release_reservation_block(inode, to_free); + } + + static void ext4_da_page_release_reservation(struct page *page, diff --git a/queue-2.6.32/0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch b/queue-2.6.32/0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch new file mode 100644 index 00000000000..f150aa609f3 --- /dev/null +++ b/queue-2.6.32/0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch @@ -0,0 +1,41 @@ +From 81799214a5369211cf9046735dafcf59a29e7454 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:49:22 -0400 +Subject: ext4: Fix accounting of reserved metadata blocks + +commit ee5f4d9cdf32fd99172d11665c592a288c2b1ff4 upstream (as of v2.6.33-rc3) + +Commit 0637c6f had a typo which caused the reserved metadata blocks to +not be released correctly. Fix this. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1118,9 +1118,9 @@ static void ext4_da_update_reserve_space + * only when we have written all of the delayed + * allocation blocks. 
+ */ +- mdb_free = ei->i_allocated_meta_blocks; ++ mdb_free = ei->i_reserved_meta_blocks; ++ ei->i_reserved_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); +- ei->i_allocated_meta_blocks = 0; + } + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + +@@ -1931,8 +1931,8 @@ static void ext4_da_release_space(struct + * only when we have written all of the delayed + * allocation blocks. + */ +- to_free += ei->i_allocated_meta_blocks; +- ei->i_allocated_meta_blocks = 0; ++ to_free += ei->i_reserved_meta_blocks; ++ ei->i_reserved_meta_blocks = 0; + } + + /* update fs dirty blocks counter */ diff --git a/queue-2.6.32/0009-ext4-Calculate-metadata-requirements-more-accurately.patch b/queue-2.6.32/0009-ext4-Calculate-metadata-requirements-more-accurately.patch new file mode 100644 index 00000000000..a3acc3cc427 --- /dev/null +++ b/queue-2.6.32/0009-ext4-Calculate-metadata-requirements-more-accurately.patch @@ -0,0 +1,295 @@ +From 665d82f8d039371ba402227e99d3b95078c97fb9 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:49:23 -0400 +Subject: ext4: Calculate metadata requirements more accurately + +commit 9d0be50230b333005635967f7ecd4897dbfd181b upstream (as of v2.6.33-rc3) + +In the past, ext4_calc_metadata_amount(), and its sub-functions +ext4_ext_calc_metadata_amount() and ext4_indirect_calc_metadata_amount() +badly over-estimated the number of metadata blocks that might be +required for delayed allocation blocks. This didn't matter as much +when functions which managed the reserved metadata blocks were more +aggressive about dropping reserved metadata blocks as delayed +allocation blocks were written, but unfortunately they were too +aggressive. This was fixed in commit 0637c6f, but as a result the +over-estimation by ext4_calc_metadata_amount() would lead to reserving +2-3 times the number of pending delayed allocation blocks as +potentially required metadata blocks. 
So if there is 1 megabyte of +blocks which have not yet been allocated, up to 3 megabytes of +space would get reserved out of the user's quota and from the file +system free space pool until all of the inode's data blocks have been +allocated. + +This commit addresses this problem by much more accurately estimating +the number of metadata blocks that will be required. It will still +somewhat over-estimate the number of blocks needed, since it must make +a worst case estimate not knowing which physical blocks will be +needed, but it is much more accurate than before. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 2 + + fs/ext4/ext4_extents.h | 3 +- + fs/ext4/extents.c | 49 ++++++++++++++++++++++++------------- + fs/ext4/inode.c | 62 +++++++++++++++++++++++++++-------------------- + fs/ext4/super.c | 1 + + 5 files changed, 73 insertions(+), 44 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 4a825c1..23bfbbc 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -693,6 +693,8 @@ struct ext4_inode_info { + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; ++ sector_t i_da_metadata_calc_last_lblock; ++ int i_da_metadata_calc_len; + + /* on-disk additional length */ + __u16 i_extra_isize; +diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h +index 2ca6864..bdb6ce7 100644 +--- a/fs/ext4/ext4_extents.h ++++ b/fs/ext4/ext4_extents.h +@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) + ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); + } + +-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); ++extern int ext4_ext_calc_metadata_amount(struct inode *inode, ++ sector_t lblocks); + extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); + extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); + extern void ext4_ext_store_pblock(struct ext4_extent *, 
ext4_fsblk_t); +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index b14fb6d..5f03f9f 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) + * to allocate @blocks + * Worse case is one block per extent + */ +-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) ++int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) + { +- int lcap, icap, rcap, leafs, idxs, num; +- int newextents = blocks; +- +- rcap = ext4_ext_space_root_idx(inode, 0); +- lcap = ext4_ext_space_block(inode, 0); +- icap = ext4_ext_space_block_idx(inode, 0); ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ int idxs, num = 0; + +- /* number of new leaf blocks needed */ +- num = leafs = (newextents + lcap - 1) / lcap; ++ idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) ++ / sizeof(struct ext4_extent_idx)); + + /* +- * Worse case, we need separate index block(s) +- * to link all new leaf blocks ++ * If the new delayed allocation block is contiguous with the ++ * previous da block, it can share index blocks with the ++ * previous block, so we only need to allocate a new index ++ * block every idxs leaf blocks. At ldxs**2 blocks, we need ++ * an additional index block, and at ldxs**3 blocks, yet ++ * another index blocks. 
+ */ +- idxs = (leafs + icap - 1) / icap; +- do { +- num += idxs; +- idxs = (idxs + icap - 1) / icap; +- } while (idxs > rcap); ++ if (ei->i_da_metadata_calc_len && ++ ei->i_da_metadata_calc_last_lblock+1 == lblock) { ++ if ((ei->i_da_metadata_calc_len % idxs) == 0) ++ num++; ++ if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) ++ num++; ++ if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { ++ num++; ++ ei->i_da_metadata_calc_len = 0; ++ } else ++ ei->i_da_metadata_calc_len++; ++ ei->i_da_metadata_calc_last_lblock++; ++ return num; ++ } + +- return num; ++ /* ++ * In the worst case we need a new set of index blocks at ++ * every level of the inode's extent tree. ++ */ ++ ei->i_da_metadata_calc_len = 1; ++ ei->i_da_metadata_calc_last_lblock = lblock; ++ return ext_depth(inode) + 1; + } + + static int +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 533bb84..2e3f422 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1051,38 +1051,44 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) + return &EXT4_I(inode)->i_reserved_quota; + } + #endif ++ + /* + * Calculate the number of metadata blocks need to reserve +- * to allocate @blocks for non extent file based file ++ * to allocate a new block at @lblocks for non extent file based file + */ +-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) ++static int ext4_indirect_calc_metadata_amount(struct inode *inode, ++ sector_t lblock) + { +- int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); +- int ind_blks, dind_blks, tind_blks; +- +- /* number of new indirect blocks needed */ +- ind_blks = (blocks + icap - 1) / icap; ++ struct ext4_inode_info *ei = EXT4_I(inode); ++ int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; ++ int blk_bits; + +- dind_blks = (ind_blks + icap - 1) / icap; ++ if (lblock < EXT4_NDIR_BLOCKS) ++ return 0; + +- tind_blks = 1; ++ lblock -= EXT4_NDIR_BLOCKS; + +- return ind_blks + dind_blks + tind_blks; ++ if (ei->i_da_metadata_calc_len && ++ (lblock & 
dind_mask) == ei->i_da_metadata_calc_last_lblock) { ++ ei->i_da_metadata_calc_len++; ++ return 0; ++ } ++ ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; ++ ei->i_da_metadata_calc_len = 1; ++ blk_bits = roundup_pow_of_two(lblock + 1); ++ return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; + } + + /* + * Calculate the number of metadata blocks need to reserve +- * to allocate given number of blocks ++ * to allocate a block located at @lblock + */ +-static int ext4_calc_metadata_amount(struct inode *inode, int blocks) ++static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) + { +- if (!blocks) +- return 0; +- + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) +- return ext4_ext_calc_metadata_amount(inode, blocks); ++ return ext4_ext_calc_metadata_amount(inode, lblock); + +- return ext4_indirect_calc_metadata_amount(inode, blocks); ++ return ext4_indirect_calc_metadata_amount(inode, lblock); + } + + /* +@@ -1120,6 +1126,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) + */ + mdb_free = ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; ++ ei->i_da_metadata_calc_len = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); + } + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +@@ -1844,12 +1851,15 @@ static int ext4_journalled_write_end(struct file *file, + return ret ? 
ret : copied; + } + +-static int ext4_da_reserve_space(struct inode *inode, int nrblocks) ++/* ++ * Reserve a single block located at lblock ++ */ ++static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) + { + int retries = 0; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); +- unsigned long md_needed, md_reserved, total = 0; ++ unsigned long md_needed, md_reserved; + + /* + * recalculate the amount of metadata blocks to reserve +@@ -1859,8 +1869,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) + repeat: + spin_lock(&ei->i_block_reservation_lock); + md_reserved = ei->i_reserved_meta_blocks; +- md_needed = ext4_calc_metadata_amount(inode, nrblocks); +- total = md_needed + nrblocks; ++ md_needed = ext4_calc_metadata_amount(inode, lblock); + spin_unlock(&ei->i_block_reservation_lock); + + /* +@@ -1868,7 +1877,7 @@ repeat: + * later. Real quota accounting is done at pages writeout + * time. + */ +- if (vfs_dq_reserve_block(inode, total)) { ++ if (vfs_dq_reserve_block(inode, md_needed + 1)) { + /* + * We tend to badly over-estimate the amount of + * metadata blocks which are needed, so if we have +@@ -1880,8 +1889,8 @@ repeat: + return -EDQUOT; + } + +- if (ext4_claim_free_blocks(sbi, total)) { +- vfs_dq_release_reservation_block(inode, total); ++ if (ext4_claim_free_blocks(sbi, md_needed + 1)) { ++ vfs_dq_release_reservation_block(inode, md_needed + 1); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + retry: + if (md_reserved) +@@ -1892,7 +1901,7 @@ repeat: + return -ENOSPC; + } + spin_lock(&ei->i_block_reservation_lock); +- ei->i_reserved_data_blocks += nrblocks; ++ ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); + +@@ -1933,6 +1942,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free) + */ + to_free += ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; ++ 
ei->i_da_metadata_calc_len = 0; + } + + /* update fs dirty blocks counter */ +@@ -2546,7 +2556,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ +- ret = ext4_da_reserve_space(inode, 1); ++ ret = ext4_da_reserve_space(inode, iblock); + if (ret) + /* not enough space to reserve */ + return ret; +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 92943f2..252f30b 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -702,6 +702,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + ei->i_reserved_data_blocks = 0; + ei->i_reserved_meta_blocks = 0; + ei->i_allocated_meta_blocks = 0; ++ ei->i_da_metadata_calc_len = 0; + ei->i_delalloc_reserved_flag = 0; + spin_lock_init(&(ei->i_block_reservation_lock)); + #ifdef CONFIG_QUOTA +-- +1.7.1 + diff --git a/queue-2.6.32/0010-ext4-Handle-EDQUOT-error-on-write.patch b/queue-2.6.32/0010-ext4-Handle-EDQUOT-error-on-write.patch new file mode 100644 index 00000000000..3bec88c9c43 --- /dev/null +++ b/queue-2.6.32/0010-ext4-Handle-EDQUOT-error-on-write.patch @@ -0,0 +1,76 @@ +From 34e8248f530c4db6c4ba200c945257e0713d9905 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Sun, 30 May 2010 22:49:24 -0400 +Subject: ext4: Handle -EDQUOT error on write + +commit 1db913823c0f8360fccbd24ca67eb073966a5ffd upstream (as of v2.6.33-rc6) + +We need to release the journal before we do a write_inode. Otherwise +we could deadlock. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 32 ++++++++++++++++++-------------- + 1 file changed, 18 insertions(+), 14 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1877,24 +1877,12 @@ repeat: + * later. Real quota accounting is done at pages writeout + * time. 
+ */ +- if (vfs_dq_reserve_block(inode, md_needed + 1)) { +- /* +- * We tend to badly over-estimate the amount of +- * metadata blocks which are needed, so if we have +- * reserved any metadata blocks, try to force out the +- * inode and see if we have any better luck. +- */ +- if (md_reserved && retries++ <= 3) +- goto retry; ++ if (vfs_dq_reserve_block(inode, md_needed + 1)) + return -EDQUOT; +- } + + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + vfs_dq_release_reservation_block(inode, md_needed + 1); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { +- retry: +- if (md_reserved) +- write_inode_now(inode, (retries == 3)); + yield(); + goto repeat; + } +@@ -3075,7 +3063,7 @@ static int ext4_da_write_begin(struct fi + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) + { +- int ret, retries = 0; ++ int ret, retries = 0, quota_retries = 0; + struct page *page; + pgoff_t index; + unsigned from, to; +@@ -3134,6 +3122,22 @@ retry: + + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; ++ ++ if ((ret == -EDQUOT) && ++ EXT4_I(inode)->i_reserved_meta_blocks && ++ (quota_retries++ < 3)) { ++ /* ++ * Since we often over-estimate the number of meta ++ * data blocks required, we may sometimes get a ++ * spurios out of quota error even though there would ++ * be enough space once we write the data blocks and ++ * find out how many meta data blocks were _really_ ++ * required. So try forcing the inode write to see if ++ * that helps. 
++ */ ++ write_inode_now(inode, (quota_retries == 3)); ++ goto retry; ++ } + out: + return ret; + } diff --git a/queue-2.6.32/0011-ext4-Fix-quota-accounting-error-with-fallocate.patch b/queue-2.6.32/0011-ext4-Fix-quota-accounting-error-with-fallocate.patch new file mode 100644 index 00000000000..1974583898e --- /dev/null +++ b/queue-2.6.32/0011-ext4-Fix-quota-accounting-error-with-fallocate.patch @@ -0,0 +1,154 @@ +From 09e8f5642b741ecfdd05c259b47796f85fdd01aa Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Sun, 30 May 2010 22:49:25 -0400 +Subject: ext4: Fix quota accounting error with fallocate + +commit 5f634d064c709ea02c3cdaa850a08323a4a4bf28 upstream (as of v2.6.33-rc6) + +When we fallocate a region of the file which we had recently written, +and which is still in the page cache marked as delayed allocated blocks +we need to make sure we don't do the quota update on writepage path. +This is because the needed quota updated would have already be done +by fallocate. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 2 ++ + fs/ext4/extents.c | 21 +++++++++++++++++++++ + fs/ext4/inode.c | 44 +++++++++++++++++++++++++++++++------------- + 3 files changed, 54 insertions(+), 13 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1440,6 +1440,8 @@ extern int ext4_block_truncate_page(hand + extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); + extern qsize_t *ext4_get_reserved_space(struct inode *inode); + extern int flush_aio_dio_completed_IO(struct inode *inode); ++extern void ext4_da_update_reserve_space(struct inode *inode, ++ int used, int quota_claim); + /* ioctl.c */ + extern long ext4_ioctl(struct file *, unsigned int, unsigned long); + extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3138,7 +3138,19 @@ out: + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + max_blocks, + 
allocated - max_blocks); ++ allocated = max_blocks; + } ++ ++ /* ++ * If we have done fallocate with the offset that is already ++ * delayed allocated, we would have block reservation ++ * and quota reservation done in the delayed write path. ++ * But fallocate would have already updated quota and block ++ * count for this offset. So cancel these reservation ++ */ ++ if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE) ++ ext4_da_update_reserve_space(inode, allocated, 0); ++ + map_out: + set_buffer_mapped(bh_result); + out1: +@@ -3374,9 +3386,18 @@ int ext4_ext_get_blocks(handle_t *handle + /* previous routine could use block we allocated */ + newblock = ext_pblock(&newex); + allocated = ext4_ext_get_actual_len(&newex); ++ if (allocated > max_blocks) ++ allocated = max_blocks; + set_buffer_new(bh_result); + + /* ++ * Update reserved blocks/metadata blocks after successful ++ * block allocation which had been deferred till now. ++ */ ++ if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE) ++ ext4_da_update_reserve_space(inode, allocated, 1); ++ ++ /* + * Cache the extent and update transaction to commit on fdatasync only + * when it is _not_ an uninitialized extent. + */ +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1095,11 +1095,12 @@ static int ext4_calc_metadata_amount(str + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. 
+ */ +-static void ext4_da_update_reserve_space(struct inode *inode, int used) ++void ext4_da_update_reserve_space(struct inode *inode, ++ int used, int quota_claim) + { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct ext4_inode_info *ei = EXT4_I(inode); +- int mdb_free = 0; ++ int mdb_free = 0, allocated_meta_blocks = 0; + + spin_lock(&ei->i_block_reservation_lock); + if (unlikely(used > ei->i_reserved_data_blocks)) { +@@ -1115,6 +1116,7 @@ static void ext4_da_update_reserve_space + ei->i_reserved_data_blocks -= used; + used += ei->i_allocated_meta_blocks; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; ++ allocated_meta_blocks = ei->i_allocated_meta_blocks; + ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); + +@@ -1132,9 +1134,23 @@ static void ext4_da_update_reserve_space + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + + /* Update quota subsystem */ +- vfs_dq_claim_block(inode, used); +- if (mdb_free) +- vfs_dq_release_reservation_block(inode, mdb_free); ++ if (quota_claim) { ++ vfs_dq_claim_block(inode, used); ++ if (mdb_free) ++ vfs_dq_release_reservation_block(inode, mdb_free); ++ } else { ++ /* ++ * We did fallocate with an offset that is already delayed ++ * allocated. So on delayed allocated writeback we should ++ * not update the quota for allocated blocks. But then ++ * converting an fallocate region to initialized region would ++ * have caused a metadata allocation. So claim quota for ++ * that ++ */ ++ if (allocated_meta_blocks) ++ vfs_dq_claim_block(inode, allocated_meta_blocks); ++ vfs_dq_release_reservation_block(inode, mdb_free + used); ++ } + + /* + * If we have done all the pending block allocations and if +@@ -1334,18 +1350,20 @@ int ext4_get_blocks(handle_t *handle, st + */ + EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; + } +- } + ++ /* ++ * Update reserved blocks/metadata blocks after successful ++ * block allocation which had been deferred till now. 
We don't ++ * support fallocate for non extent files. So we can update ++ * reserve space here. ++ */ ++ if ((retval > 0) && ++ (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) ++ ext4_da_update_reserve_space(inode, retval, 1); ++ } + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + EXT4_I(inode)->i_delalloc_reserved_flag = 0; + +- /* +- * Update reserved blocks/metadata blocks after successful +- * block allocation which had been deferred till now. +- */ +- if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) +- ext4_da_update_reserve_space(inode, retval); +- + up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, "file system " diff --git a/queue-2.6.32/0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch b/queue-2.6.32/0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch new file mode 100644 index 00000000000..b6119c8e168 --- /dev/null +++ b/queue-2.6.32/0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch @@ -0,0 +1,99 @@ +From 3a1a12ca4219f564fe4f86cae1bfb563422a2d15 Mon Sep 17 00:00:00 2001 +From: Aneesh Kumar K.V +Date: Sun, 30 May 2010 22:49:26 -0400 +Subject: ext4: Drop EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE flag + +commit 1296cc85c26e94eb865d03f82140f27d598de467 upstream (as of v2.6.33-rc6) + +We should update reserve space if it is delalloc buffer +and that is indicated by EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. 
+So use EXT4_GET_BLOCKS_DELALLOC_RESERVE in place of +EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE + +[ Stable note: This fixes a corruption caused by the following + reproduction case: + + rm -f $TEST_FN + touch $TEST_FN + fallocate -n -o 656712 -l 858907 $TEST_FN + dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=1011020 count=36983 + sync + dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=332121 count=24005 + dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=1040179 count=93319 + + If the filesystem is then unmounted and e2fsck run forced, the + i_blocks field for the file $TEST_FN will be found to be incorrect. ] + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 7 ++----- + fs/ext4/extents.c | 4 ++-- + fs/ext4/inode.c | 8 ++++---- + 3 files changed, 8 insertions(+), 11 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -361,14 +361,11 @@ struct ext4_new_group_data { + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ + #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 +- /* Call ext4_da_update_reserve_space() after successfully +- allocating the blocks */ +-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 + /* caller is from the direct IO path, request to creation of an + unitialized extents if not allocated, split the uninitialized + extent if blocks has been preallocated already*/ +-#define EXT4_GET_BLOCKS_DIO 0x0010 +-#define EXT4_GET_BLOCKS_CONVERT 0x0020 ++#define EXT4_GET_BLOCKS_DIO 0x0008 ++#define EXT4_GET_BLOCKS_CONVERT 0x0010 + #define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) + /* Convert extent to initialized after direct IO complete */ +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3148,7 +3148,7 @@ out: + * But fallocate would have already updated quota and block + * count for this offset. 
So cancel these reservation + */ +- if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE) ++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 0); + + map_out: +@@ -3394,7 +3394,7 @@ int ext4_ext_get_blocks(handle_t *handle + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ +- if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE) ++ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 1); + + /* +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1358,7 +1358,7 @@ int ext4_get_blocks(handle_t *handle, st + * reserve space here. + */ + if ((retval > 0) && +- (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) ++ (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) +@@ -2261,10 +2261,10 @@ static int mpage_da_map_blocks(struct mp + * variables are updated after the blocks have been allocated. 
+ */ + new.b_state = 0; +- get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | +- EXT4_GET_BLOCKS_DELALLOC_RESERVE); ++ get_blocks_flags = EXT4_GET_BLOCKS_CREATE; + if (mpd->b_state & (1 << BH_Delay)) +- get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; ++ get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; ++ + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, + &new, get_blocks_flags); + if (blks < 0) { diff --git a/queue-2.6.32/0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch b/queue-2.6.32/0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch new file mode 100644 index 00000000000..57f873cae9c --- /dev/null +++ b/queue-2.6.32/0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch @@ -0,0 +1,409 @@ +From f7ae767b11e7ac054c5f8de55e5a83ec7c60c6a0 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:49:27 -0400 +Subject: ext4: Use bitops to read/modify EXT4_I(inode)->i_state + +commit 19f5fb7ad679bb361222c7916086435020c37cce upstream (as of v2.6.33-git11) + +At several places we modify EXT4_I(inode)->i_state without holding +i_mutex (ext4_release_file, ext4_bmap, ext4_journalled_writepage, +ext4_do_update_inode, ...). These modifications are racy and we can +lose updates to i_state. So convert handling of i_state to use bitops +which are atomic. 
+ +Cc: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 41 +++++++++++++++++++++++++++++------------ + fs/ext4/extents.c | 8 ++++---- + fs/ext4/file.c | 4 ++-- + fs/ext4/ialloc.c | 3 ++- + fs/ext4/inode.c | 38 ++++++++++++++++++++------------------ + fs/ext4/migrate.c | 6 +++--- + fs/ext4/xattr.c | 22 +++++++++++----------- + 7 files changed, 71 insertions(+), 51 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -313,17 +313,6 @@ static inline __u32 ext4_mask_flags(umod + return flags & EXT4_OTHER_FLMASK; + } + +-/* +- * Inode dynamic state flags +- */ +-#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ +-#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ +-#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ +-#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ +-#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ +-#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ +-#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ +- + /* Used to pass group descriptor data when online resize is done */ + struct ext4_new_group_input { + __u32 group; /* Group number for this data */ +@@ -624,7 +613,7 @@ struct ext4_inode_info { + * near to their parent directory's inode. 
+ */ + ext4_group_t i_block_group; +- __u32 i_state; /* Dynamic state flags for ext4 */ ++ unsigned long i_state_flags; /* Dynamic state flags */ + + ext4_lblk_t i_dir_start_lookup; + #ifdef CONFIG_EXT4_FS_XATTR +@@ -1044,6 +1033,34 @@ static inline int ext4_valid_inum(struct + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); + } ++ ++/* ++ * Inode dynamic state flags ++ */ ++enum { ++ EXT4_STATE_JDATA, /* journaled data exists */ ++ EXT4_STATE_NEW, /* inode is newly created */ ++ EXT4_STATE_XATTR, /* has in-inode xattrs */ ++ EXT4_STATE_NO_EXPAND, /* No space for expansion */ ++ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ ++ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ ++ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ ++}; ++ ++static inline int ext4_test_inode_state(struct inode *inode, int bit) ++{ ++ return test_bit(bit, &EXT4_I(inode)->i_state_flags); ++} ++ ++static inline void ext4_set_inode_state(struct inode *inode, int bit) ++{ ++ set_bit(bit, &EXT4_I(inode)->i_state_flags); ++} ++ ++static inline void ext4_clear_inode_state(struct inode *inode, int bit) ++{ ++ clear_bit(bit, &EXT4_I(inode)->i_state_flags); ++} + #else + /* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3082,7 +3082,7 @@ ext4_ext_handle_uninitialized_extents(ha + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else +- EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; ++ ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + goto out; + } + /* async DIO end_io complete, convert the filled extent to written */ +@@ -3368,8 +3368,8 @@ int ext4_ext_get_blocks(handle_t *handle + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else +- EXT4_I(inode)->i_state |= +- EXT4_STATE_DIO_UNWRITTEN;; ++ ext4_set_inode_state(inode, ++ EXT4_STATE_DIO_UNWRITTEN); + } + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); +@@ -3745,7 +3745,7 @@ static int ext4_xattr_fiemap(struct inod + int error = 0; + + /* in-inode? */ +- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { ++ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -35,9 +35,9 @@ + */ + static int ext4_release_file(struct inode *inode, struct file *filp) + { +- if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) { ++ if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { + ext4_alloc_da_blocks(inode); +- EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE; ++ ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + } + /* if we are the last writer on the inode, drop the block reservation */ + if ((filp->f_mode & FMODE_WRITE) && +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -1029,7 +1029,8 @@ got: + inode->i_generation = sbi->s_next_generation++; + spin_unlock(&sbi->s_next_gen_lock); + +- ei->i_state = EXT4_STATE_NEW; ++ ei->i_state_flags = 0; ++ ext4_set_inode_state(inode, EXT4_STATE_NEW); + + ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1348,7 +1348,7 @@ int ext4_get_blocks(handle_t *handle, st + * i_data's format 
changing. Force the migrate + * to fail by clearing migrate flags + */ +- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; ++ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + } + + /* +@@ -1835,7 +1835,7 @@ static int ext4_journalled_write_end(str + new_i_size = pos + copied; + if (new_i_size > inode->i_size) + i_size_write(inode, pos+copied); +- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; ++ ext4_set_inode_state(inode, EXT4_STATE_JDATA); + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + ret2 = ext4_mark_inode_dirty(handle, inode); +@@ -2673,7 +2673,7 @@ static int __ext4_journalled_writepage(s + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); +- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; ++ ext4_set_inode_state(inode, EXT4_STATE_JDATA); + out: + return ret; + } +@@ -3344,7 +3344,8 @@ static sector_t ext4_bmap(struct address + filemap_write_and_wait(mapping); + } + +- if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { ++ if (EXT4_JOURNAL(inode) && ++ ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: +@@ -3363,7 +3364,7 @@ static sector_t ext4_bmap(struct address + * everything they get. 
+ */ + +- EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA; ++ ext4_clear_inode_state(inode, EXT4_STATE_JDATA); + journal = EXT4_JOURNAL(inode); + jbd2_journal_lock_updates(journal); + err = jbd2_journal_flush(journal); +@@ -3831,8 +3832,8 @@ static ssize_t ext4_ext_direct_IO(int rw + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { + ext4_free_io_end(iocb->private); + iocb->private = NULL; +- } else if (ret > 0 && (EXT4_I(inode)->i_state & +- EXT4_STATE_DIO_UNWRITTEN)) { ++ } else if (ret > 0 && ext4_test_inode_state(inode, ++ EXT4_STATE_DIO_UNWRITTEN)) { + int err; + /* + * for non AIO case, since the IO is already +@@ -3842,7 +3843,7 @@ static ssize_t ext4_ext_direct_IO(int rw + offset, ret); + if (err < 0) + ret = err; +- EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; ++ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); + } + return ret; + } +@@ -4490,7 +4491,7 @@ void ext4_truncate(struct inode *inode) + return; + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) +- ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; ++ ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { + ext4_ext_truncate(inode); +@@ -4776,7 +4777,7 @@ int ext4_get_inode_loc(struct inode *ino + { + /* We have all inode data except xattrs in memory here. */ + return __ext4_get_inode_loc(inode, iloc, +- !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)); ++ !ext4_test_inode_state(inode, EXT4_STATE_XATTR)); + } + + void ext4_set_inode_flags(struct inode *inode) +@@ -4870,7 +4871,7 @@ struct inode *ext4_iget(struct super_blo + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + +- ei->i_state = 0; ++ ei->i_state_flags = 0; + ei->i_dir_start_lookup = 0; + ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. 
+@@ -4953,7 +4954,7 @@ struct inode *ext4_iget(struct super_blo + EXT4_GOOD_OLD_INODE_SIZE + + ei->i_extra_isize; + if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) +- ei->i_state |= EXT4_STATE_XATTR; ++ ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } + } else + ei->i_extra_isize = 0; +@@ -5093,7 +5094,7 @@ static int ext4_do_update_inode(handle_t + + /* For fields not not tracking in the in-memory inode, + * initialise them to zero for new inodes. */ +- if (ei->i_state & EXT4_STATE_NEW) ++ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); + + ext4_get_inode_flags(ei); +@@ -5189,7 +5190,7 @@ static int ext4_do_update_inode(handle_t + rc = ext4_handle_dirty_metadata(handle, inode, bh); + if (!err) + err = rc; +- ei->i_state &= ~EXT4_STATE_NEW; ++ ext4_clear_inode_state(inode, EXT4_STATE_NEW); + + ext4_update_inode_fsync_trans(handle, inode, 0); + out_brelse: +@@ -5613,8 +5614,8 @@ static int ext4_expand_extra_isize(struc + entry = IFIRST(header); + + /* No extended attributes present */ +- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) || +- header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { ++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || ++ header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { + memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0, + new_extra_isize); + EXT4_I(inode)->i_extra_isize = new_extra_isize; +@@ -5658,7 +5659,7 @@ int ext4_mark_inode_dirty(handle_t *hand + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && +- !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { ++ !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { + /* + * We need extra buffer credits since we may write into EA block + * with this same handle. 
If journal_extend fails, then it will +@@ -5672,7 +5673,8 @@ int ext4_mark_inode_dirty(handle_t *hand + sbi->s_want_extra_isize, + iloc, handle); + if (ret) { +- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; ++ ext4_set_inode_state(inode, ++ EXT4_STATE_NO_EXPAND); + if (mnt_count != + le16_to_cpu(sbi->s_es->s_mnt_count)) { + ext4_warning(inode->i_sb, __func__, +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -357,12 +357,12 @@ static int ext4_ext_swap_inode_data(hand + * happened after we started the migrate. We need to + * fail the migrate + */ +- if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { ++ if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else +- EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; ++ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + /* + * We have the extent map build with the tmp inode. + * Now copy the i_data across +@@ -524,7 +524,7 @@ int ext4_ext_migrate(struct inode *inode + * allocation. 
+ */ + down_read((&EXT4_I(inode)->i_data_sem)); +- EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; ++ ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE); + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode + void *end; + int error; + +- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) ++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return -ENODATA; + error = ext4_get_inode_loc(inode, &iloc); + if (error) +@@ -393,7 +393,7 @@ ext4_xattr_ibody_list(struct inode *inod + void *end; + int error; + +- if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR)) ++ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) + return 0; + error = ext4_get_inode_loc(inode, &iloc); + if (error) +@@ -903,7 +903,7 @@ ext4_xattr_ibody_find(struct inode *inod + is->s.base = is->s.first = IFIRST(header); + is->s.here = is->s.first; + is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; +- if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { ++ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { + error = ext4_xattr_check_names(IFIRST(header), is->s.end); + if (error) + return error; +@@ -935,10 +935,10 @@ ext4_xattr_ibody_set(handle_t *handle, s + header = IHDR(inode, ext4_raw_inode(&is->iloc)); + if (!IS_LAST_ENTRY(s->first)) { + header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); +- EXT4_I(inode)->i_state |= EXT4_STATE_XATTR; ++ ext4_set_inode_state(inode, EXT4_STATE_XATTR); + } else { + header->h_magic = cpu_to_le32(0); +- EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR; ++ ext4_clear_inode_state(inode, EXT4_STATE_XATTR); + } + return 0; + } +@@ -981,8 +981,8 @@ ext4_xattr_set_handle(handle_t *handle, + if (strlen(name) > 255) + return -ERANGE; + down_write(&EXT4_I(inode)->xattr_sem); +- no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; +- EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; ++ no_expand = 
ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ++ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); + + error = ext4_get_inode_loc(inode, &is.iloc); + if (error) +@@ -992,10 +992,10 @@ ext4_xattr_set_handle(handle_t *handle, + if (error) + goto cleanup; + +- if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { ++ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { + struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); + memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); +- EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW; ++ ext4_clear_inode_state(inode, EXT4_STATE_NEW); + } + + error = ext4_xattr_ibody_find(inode, &i, &is); +@@ -1047,7 +1047,7 @@ ext4_xattr_set_handle(handle_t *handle, + ext4_xattr_update_super_block(handle, inode->i_sb); + inode->i_ctime = ext4_current_time(inode); + if (!value) +- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; ++ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); + /* + * The bh is consumed by ext4_mark_iloc_dirty, even with +@@ -1062,7 +1062,7 @@ cleanup: + brelse(is.iloc.bh); + brelse(bs.bh); + if (no_expand == 0) +- EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; ++ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); + up_write(&EXT4_I(inode)->xattr_sem); + return error; + } diff --git a/queue-2.6.32/0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch b/queue-2.6.32/0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch new file mode 100644 index 00000000000..b46cbd4e46e --- /dev/null +++ b/queue-2.6.32/0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch @@ -0,0 +1,99 @@ +From 04cbf99a9333c66de2474429c01e13d110aa5fd0 Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Sun, 30 May 2010 22:49:28 -0400 +Subject: ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode + +commit 73b50c1c92666d326b5fa2c945d46509f2f6d91f upstream (as of v2.6.33-git11) + +Calls to ext4_handle_dirty_metadata should only pass in an inode +pointer for 
inode-specific metadata, and not for shared metadata +blocks such as inode table blocks, block group descriptors, the +superblock, etc. + +The BUG_ON can get tripped when updating a special device (such as a +block device) that is opened (so that i_mapping is set in +fs/block_dev.c) and the file system is mounted in no journal mode. + +Addresses-Google-Bug: #2404870 + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4_jbd2.c | 2 +- + fs/ext4/ialloc.c | 2 +- + fs/ext4/inode.c | 6 +++--- + fs/ext4/namei.c | 4 ++-- + 4 files changed, 7 insertions(+), 7 deletions(-) + +--- a/fs/ext4/ext4_jbd2.c ++++ b/fs/ext4/ext4_jbd2.c +@@ -89,7 +89,7 @@ int __ext4_handle_dirty_metadata(const c + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } else { +- if (inode && bh) ++ if (inode) + mark_buffer_dirty_inode(bh, inode); + else + mark_buffer_dirty(bh); +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -904,7 +904,7 @@ repeat_in_this_group: + BUFFER_TRACE(inode_bitmap_bh, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, +- inode, ++ NULL, + inode_bitmap_bh); + if (err) + goto fail; +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5158,7 +5158,7 @@ static int ext4_do_update_inode(handle_t + EXT4_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + ext4_handle_sync(handle); +- err = ext4_handle_dirty_metadata(handle, inode, ++ err = ext4_handle_dirty_metadata(handle, NULL, + EXT4_SB(sb)->s_sbh); + } + } +@@ -5187,7 +5187,7 @@ static int ext4_do_update_inode(handle_t + } + + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); +- rc = ext4_handle_dirty_metadata(handle, inode, bh); ++ rc = ext4_handle_dirty_metadata(handle, NULL, bh); + if (!err) + err = rc; + ext4_clear_inode_state(inode, EXT4_STATE_NEW); +@@ -5741,7 +5741,7 @@ static int ext4_pin_inode(handle_t *hand + err = jbd2_journal_get_write_access(handle, iloc.bh); + if (!err) + err = 
ext4_handle_dirty_metadata(handle, +- inode, ++ NULL, + iloc.bh); + brelse(iloc.bh); + } +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -2024,7 +2024,7 @@ int ext4_orphan_add(handle_t *handle, st + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); + EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); +- err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); ++ err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +@@ -2096,7 +2096,7 @@ int ext4_orphan_del(handle_t *handle, st + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); +- err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); ++ err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = diff --git a/queue-2.6.32/0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch b/queue-2.6.32/0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch new file mode 100644 index 00000000000..2909322604f --- /dev/null +++ b/queue-2.6.32/0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch @@ -0,0 +1,140 @@ +From 9d176d321904553ab92a5df99e25ccb268a5560e Mon Sep 17 00:00:00 2001 +From: Jiaying Zhang +Date: Sun, 30 May 2010 22:49:29 -0400 +Subject: ext4: Add flag to files with blocks intentionally past EOF + +commit c8d46e41bc744c8fa0092112af3942fcd46c8b18 upstream (as of v2.6.33-git11) + +fallocate() may potentially instantiate blocks past EOF, depending +on the flags used when it is called. + +e2fsck currently has a test for blocks past i_size, and it +sometimes trips up - noticeably on xfstests 013 which runs fsstress. + +This patch from Jiayang does fix it up - it (along with +e2fsprogs updates and other patches recently from Aneesh) has +survived many fsstress runs in a row. 
+ +Signed-off-by: Eric Sandeen +Signed-off-by: Jiaying Zhang +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 6 ++++-- + fs/ext4/extents.c | 22 +++++++++++++++++++++- + fs/ext4/inode.c | 9 ++++++++- + fs/ext4/ioctl.c | 9 +++++++++ + 4 files changed, 42 insertions(+), 4 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -284,10 +284,12 @@ struct flex_groups { + #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ + #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ + #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ ++#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ ++#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */ + #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +-#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ +-#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ ++#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ ++#define EXT4_FL_USER_MODIFIABLE 0x004B80FF /* User modifiable flags */ + + /* Flags that should be inherited by new inodes from their parent. 
*/ + #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3191,7 +3191,7 @@ int ext4_ext_get_blocks(handle_t *handle + { + struct ext4_ext_path *path = NULL; + struct ext4_extent_header *eh; +- struct ext4_extent newex, *ex; ++ struct ext4_extent newex, *ex, *last_ex; + ext4_fsblk_t newblock; + int err = 0, depth, ret, cache_type; + unsigned int allocated = 0; +@@ -3372,6 +3372,19 @@ int ext4_ext_get_blocks(handle_t *handle + EXT4_STATE_DIO_UNWRITTEN); + } + } ++ ++ if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { ++ if (eh->eh_entries) { ++ last_ex = EXT_LAST_EXTENT(eh); ++ if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) ++ + ext4_ext_get_actual_len(last_ex)) ++ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; ++ } else { ++ WARN_ON(eh->eh_entries == 0); ++ ext4_error(inode->i_sb, __func__, ++ "inode#%lu, eh->eh_entries = 0!", inode->i_ino); ++ } ++ } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err) { + /* free data blocks we just allocated */ +@@ -3505,6 +3518,13 @@ static void ext4_falloc_update_inode(str + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); ++ } else { ++ /* ++ * Mark that we allocate beyond EOF so the subsequent truncate ++ * can proceed even if the new size is the same as i_size. 
++ */ ++ if (new_size > i_size_read(inode)) ++ EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; + } + + } +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -4490,6 +4490,8 @@ void ext4_truncate(struct inode *inode) + if (!ext4_can_truncate(inode)) + return; + ++ EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; ++ + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + +@@ -5345,7 +5347,9 @@ int ext4_setattr(struct dentry *dentry, + } + + if (S_ISREG(inode->i_mode) && +- attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { ++ attr->ia_valid & ATTR_SIZE && ++ (attr->ia_size < inode->i_size || ++ (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { + handle_t *handle; + + handle = ext4_journal_start(inode, 3); +@@ -5376,6 +5380,9 @@ int ext4_setattr(struct dentry *dentry, + goto err_out; + } + } ++ /* ext4_truncate will clear the flag */ ++ if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) ++ ext4_truncate(inode); + } + + rc = inode_setattr(inode, attr); +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsig + flags &= ~EXT4_EXTENTS_FL; + } + ++ if (flags & EXT4_EOFBLOCKS_FL) { ++ /* we don't support adding EOFBLOCKS flag */ ++ if (!(oldflags & EXT4_EOFBLOCKS_FL)) { ++ err = -EOPNOTSUPP; ++ goto flags_out; ++ } ++ } else if (oldflags & EXT4_EOFBLOCKS_FL) ++ ext4_truncate(inode); ++ + handle = ext4_journal_start(inode, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); diff --git a/queue-2.6.32/0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch b/queue-2.6.32/0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch new file mode 100644 index 00000000000..1a2f3f4b643 --- /dev/null +++ b/queue-2.6.32/0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch @@ -0,0 +1,60 @@ +From 2cbbb92297f15740e27f2e87eb21ab86d4432cba Mon Sep 17 00:00:00 2001 +From: Tao Ma +Date: Sun, 30 May 2010 22:49:30 -0400 +Subject: ext4: Fix 
fencepost error in chosing choosing group vs file preallocation. + +commit cc483f102c3f703e853c96f95a654f0106fb2603 upstream (as of v2.6.33-git11) + +The ext4 multiblock allocator decides whether to use group or file +preallocation based on the file size. When the file size reaches +s_mb_stream_request (default is 16 blocks), it changes to use a +file-specific preallocation. This is cool, but it has a tiny problem. + +See a simple script: +mkfs.ext4 -b 1024 /dev/sda8 1000000 +mount -t ext4 -o nodelalloc /dev/sda8 /mnt/ext4 +for((i=0;i<5;i++)) +do +cat /mnt/4096>>/mnt/ext4/a #4096 is a file with 4096 characters. +cat /mnt/4096>>/mnt/ext4/b +done +debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1 + +And you get +BLOCKS: +(0-14):8705-8719, (15):2356, (16-19):8465-8468 + +So there are 3 extents, a bit strange for the lonely 15th logical +block. As we write to the 16 blocks, we choose file preallocation in +ext4_mb_group_or_file, but in ext4_mb_normalize_request, we meet with +the 16*1024 range, so no preallocation will be carried. file b then +reserves the space after '2356', so when we write 16, we start from +another part. + +This patch just changes the check in ext4_mb_group_or_file, so +that for the lonely 15 we will still use group preallocation. +After the patch, we will get: +debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1 +BLOCKS: +(0-15):8705-8720, (16-19):8465-8468 + +Looks more sane. Thanks.
+ +Signed-off-by: Tao Ma +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3938,7 +3938,7 @@ static void ext4_mb_group_or_file(struct + + /* don't use group allocation for large files */ + size = max(size, isize); +- if (size >= sbi->s_mb_stream_request) { ++ if (size > sbi->s_mb_stream_request) { + ac->ac_flags |= EXT4_MB_STREAM_ALLOC; + return; + } diff --git a/queue-2.6.32/0017-ext4-fix-error-handling-in-migrate.patch b/queue-2.6.32/0017-ext4-fix-error-handling-in-migrate.patch new file mode 100644 index 00000000000..8295f0af879 --- /dev/null +++ b/queue-2.6.32/0017-ext4-fix-error-handling-in-migrate.patch @@ -0,0 +1,75 @@ +From 492c93e8097f0bf58b2884064af85242fabe5d71 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:31 -0400 +Subject: ext4: fix error handling in migrate + +commit f39490bcd1691d65dc33689222a12e1fc13dd824 upstream (as of v2.6.33-git11) + +Set i_nlink to zero for temporary inode from very beginning. +otherwise we may fail to start new journal handle and this +inode will be unreferenced but with i_nlink == 1 +Since we hold inode reference it can not be pruned. + +Also add missed journal_start retval check. + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/migrate.c | 29 ++++++++++++++--------------- + 1 file changed, 14 insertions(+), 15 deletions(-) + +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -494,14 +494,10 @@ int ext4_ext_migrate(struct inode *inode + } + i_size_write(tmp_inode, i_size_read(inode)); + /* +- * We don't want the inode to be reclaimed +- * if we got interrupted in between. We have +- * this tmp inode carrying reference to the +- * data blocks of the original file. 
We set +- * the i_nlink to zero at the last stage after +- * switching the original file to extent format ++ * Set the i_nlink to zero so it will be deleted later ++ * when we drop inode reference. + */ +- tmp_inode->i_nlink = 1; ++ tmp_inode->i_nlink = 0; + + ext4_ext_tree_init(handle, tmp_inode); + ext4_orphan_add(handle, tmp_inode); +@@ -528,6 +524,16 @@ int ext4_ext_migrate(struct inode *inode + up_read((&EXT4_I(inode)->i_data_sem)); + + handle = ext4_journal_start(inode, 1); ++ if (IS_ERR(handle)) { ++ /* ++ * It is impossible to update on-disk structures without ++ * a handle, so just rollback in-core changes and live other ++ * work to orphan_list_cleanup() ++ */ ++ ext4_orphan_del(NULL, tmp_inode); ++ retval = PTR_ERR(handle); ++ goto out; ++ } + + ei = EXT4_I(inode); + i_data = ei->i_data; +@@ -609,15 +615,8 @@ err_out: + + /* Reset the extent details */ + ext4_ext_tree_init(handle, tmp_inode); +- +- /* +- * Set the i_nlink to zero so that +- * generic_drop_inode really deletes the +- * inode +- */ +- tmp_inode->i_nlink = 0; +- + ext4_journal_stop(handle); ++out: + unlock_new_inode(tmp_inode); + iput(tmp_inode); + diff --git a/queue-2.6.32/0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch b/queue-2.6.32/0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch new file mode 100644 index 00000000000..d752a67827f --- /dev/null +++ b/queue-2.6.32/0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch @@ -0,0 +1,28 @@ +From 6c582d8b4e6868f8e16d160c0435530d5f8fa8e5 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:32 -0400 +Subject: ext4: explicitly remove inode from orphan list after failed direct io + +commit da1dafca84413145f5ac59998b4cdd06fb89f721 upstream (as of v2.6.33-git11) + +Otherwise non-empty orphan list will be triggered on umount. 
+ +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -3480,6 +3480,9 @@ retry: + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); ++ if (inode->i_nlink) ++ ext4_orphan_del(NULL, inode); ++ + goto out; + } + if (inode->i_nlink) diff --git a/queue-2.6.32/0019-ext4-Handle-non-empty-on-disk-orphan-link.patch b/queue-2.6.32/0019-ext4-Handle-non-empty-on-disk-orphan-link.patch new file mode 100644 index 00000000000..9bef19c9143 --- /dev/null +++ b/queue-2.6.32/0019-ext4-Handle-non-empty-on-disk-orphan-link.patch @@ -0,0 +1,45 @@ +From 7765050b0f7e5ffc9146c5cea83a14774ff03a73 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:33 -0400 +Subject: ext4: Handle non empty on-disk orphan link + +commit 6e3617e579e070d3655a93ee9ed7149113e795e0 upstream (as of v2.6.33-git11) + +In case of truncate errors we explicitly remove inode from in-core +orphan list via orphan_del(NULL, inode) without modifying the on-disk list. + +But later on, the same inode may be inserted in the orphan list again +which will result the on-disk linked list getting corrupted. If inode +i_dtime contains valid value, then skip on-disk list modification. + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/namei.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -2020,6 +2020,13 @@ int ext4_orphan_add(handle_t *handle, st + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; ++ /* ++ * Due to previous errors inode may be already a part of on-disk ++ * orphan list. If so skip on-disk list modification. 
++ */ ++ if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <= ++ (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) ++ goto mem_insert; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); +@@ -2037,6 +2044,7 @@ int ext4_orphan_add(handle_t *handle, st + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ ++mem_insert: + if (!err) + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + diff --git a/queue-2.6.32/0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch b/queue-2.6.32/0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch new file mode 100644 index 00000000000..bc332b50719 --- /dev/null +++ b/queue-2.6.32/0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch @@ -0,0 +1,40 @@ +From 5921c8d6a6e598b1101b5785f09bbe334e92957d Mon Sep 17 00:00:00 2001 +From: Toshiyuki Okajima +Date: Sun, 30 May 2010 22:49:34 -0400 +Subject: ext4: make "offset" consistent in ext4_check_dir_entry() + +commit b8b8afe236e97b6359d46d3a3f8c46455e192271 upstream (as of v2.6.33-git11) + +The callers of ext4_check_dir_entry() usually pass in the "file +offset" (ext4_readdir, htree_dirblock_to_tree, search_dirblock, +ext4_dx_find_entry, empty_dir), but a few callers (add_dirent_to_buf, +ext4_delete_entry) only pass in the buffer offset. + +To accommodate those last two (which would be hard to fix otherwise), +this patch changes ext4_check_dir_entry() to print the physical block +number and the relative offset as well as the passed-in offset.
+ +Signed-off-by: Toshiyuki Okajima +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/dir.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *fun + + if (error_msg != NULL) + ext4_error(dir->i_sb, function, +- "bad entry in directory #%lu: %s - " +- "offset=%u, inode=%u, rec_len=%d, name_len=%d", +- dir->i_ino, error_msg, offset, ++ "bad entry in directory #%lu: %s - block=%llu" ++ "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", ++ dir->i_ino, error_msg, ++ (unsigned long long) bh->b_blocknr, ++ (unsigned) (offset%bh->b_size), offset, + le32_to_cpu(de->inode), + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; diff --git a/queue-2.6.32/0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch b/queue-2.6.32/0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch new file mode 100644 index 00000000000..71b6d63e9a2 --- /dev/null +++ b/queue-2.6.32/0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch @@ -0,0 +1,54 @@ +From 98cc8ca4405bfb2d511c83ced6c46153c04d5f76 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sun, 30 May 2010 22:49:35 -0400 +Subject: ext4: Fix insertion point of extent in mext_insert_across_blocks() + +commit 5fd5249aa36fad98c9fd5edced352939e54f9324 upstream (as of v2.6.33-git11) + +If the leaf node has 2 extent space or fewer and EXT4_IOC_MOVE_EXT +ioctl is called with the file offset where after the 2nd extent +covers, mext_insert_across_blocks() always tries to insert extent into +the first extent. As a result, the file gets corrupted because of +wrong extent order. The patch fixes this problem. 
+ +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *hand + } + + o_start->ee_len = start_ext->ee_len; ++ eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (start_ext->ee_len && new_ext->ee_len && +@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *hand + * orig |------------------------------| + */ + o_start->ee_len = start_ext->ee_len; ++ eblock = le32_to_cpu(start_ext->ee_block); + new_flag = 1; + + } else if (!start_ext->ee_len && new_ext->ee_len && +@@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct + le32_to_cpu(oext->ee_block) + oext_alen) { + start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - + le32_to_cpu(oext->ee_block)); ++ start_ext.ee_block = oext->ee_block; + copy_extent_status(oext, &start_ext); + } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { + prev_ext = oext - 1; +@@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct + start_ext.ee_len = cpu_to_le16( + ext4_ext_get_actual_len(prev_ext) + + new_ext_alen); ++ start_ext.ee_block = oext->ee_block; + copy_extent_status(prev_ext, &start_ext); + new_ext.ee_len = 0; + } diff --git a/queue-2.6.32/0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch b/queue-2.6.32/0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch new file mode 100644 index 00000000000..d412ffb0f32 --- /dev/null +++ b/queue-2.6.32/0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch @@ -0,0 +1,50 @@ +From 06518e8c9d0a67cb024545b880849b68b79a5390 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sun, 30 May 2010 22:49:36 -0400 +Subject: ext4: Fix the NULL reference in double_down_write_data_sem() + +commit 7247c0caa23d94a1cb6b307edba9dc45fb0798d4 upstream (as of v2.6.33-git11) + +If EXT4_IOC_MOVE_EXT 
ioctl is called with NULL donor_fd, fget() in +ext4_ioctl() gets inappropriate file structure for donor; so we need +to do this check earlier, before calling double_down_write_data_sem(). + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/move_extent.c | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -953,14 +953,6 @@ mext_check_arguments(struct inode *orig_ + unsigned int blkbits = orig_inode->i_blkbits; + unsigned int blocksize = 1 << blkbits; + +- /* Regular file check */ +- if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { +- ext4_debug("ext4 move extent: The argument files should be " +- "regular file [ino:orig %lu, donor %lu]\n", +- orig_inode->i_ino, donor_inode->i_ino); +- return -EINVAL; +- } +- + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { + ext4_debug("ext4 move extent: suid or sgid is set" + " to donor file [ino:orig %lu, donor %lu]\n", +@@ -1207,6 +1199,14 @@ ext4_move_extents(struct file *o_filp, s + orig_inode->i_ino, donor_inode->i_ino); + return -EINVAL; + } ++ ++ /* Regular file check */ ++ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ++ ext4_debug("ext4 move extent: The argument files should be " ++ "regular file [ino:orig %lu, donor %lu]\n", ++ orig_inode->i_ino, donor_inode->i_ino); ++ return -EINVAL; ++ } + + /* Protect orig and donor inodes against a truncate */ + ret1 = mext_inode_double_lock(orig_inode, donor_inode); diff --git a/queue-2.6.32/0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch b/queue-2.6.32/0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch new file mode 100644 index 00000000000..6e7e766d92f --- /dev/null +++ b/queue-2.6.32/0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch @@ -0,0 +1,58 @@ +From eee98b87da36ae78c6867d8ce1943f65a16da648 Mon Sep 17 00:00:00 2001 +From: Akira Fujita +Date: Sun, 30 May 2010 22:49:37 
-0400 +Subject: ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl + +commit c437b2733520599a2c6e0dbcdeae611319f84707 upstream (as of v2.6.33-git11) + +a) Fix sparse warning in ext4_ioctl() +b) Remove unneeded variable in mext_leaf_block() +c) Fix spelling typo in mext_check_arguments() + +Signed-off-by: Akira Fujita +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ioctl.c | 3 ++- + fs/ext4/move_extent.c | 4 +--- + 2 files changed, 3 insertions(+), 4 deletions(-) + +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -258,7 +258,8 @@ setversion_out: + if (me.moved_len > 0) + file_remove_suid(donor_filp); + +- if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) ++ if (copy_to_user((struct move_extent __user *)arg, ++ &me, sizeof(me))) + err = -EFAULT; + mext_out: + fput(donor_filp); +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -477,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct + struct ext4_extent *oext, *o_start, *o_end, *prev_ext; + struct ext4_extent new_ext, start_ext, end_ext; + ext4_lblk_t new_ext_end; +- ext4_fsblk_t new_phys_end; + int oext_alen, new_ext_alen, end_ext_alen; + int depth = ext_depth(orig_inode); + int ret; +@@ -491,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct + new_ext.ee_len = dext->ee_len; + new_ext_alen = ext4_ext_get_actual_len(&new_ext); + new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; +- new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1; + + /* + * Case: original extent is first +@@ -932,7 +930,7 @@ out2: + } + + /** +- * mext_check_argumants - Check whether move extent can be done ++ * mext_check_arguments - Check whether move extent can be done + * + * @orig_inode: original inode + * @donor_inode: donor inode diff --git a/queue-2.6.32/0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch b/queue-2.6.32/0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch new file mode 100644 index 00000000000..9fa6f64bcc0 --- /dev/null +++ 
b/queue-2.6.32/0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch @@ -0,0 +1,36 @@ +From 24bce2c3022a0ff4cb418ed11173bef96bd9806a Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Sun, 30 May 2010 22:49:38 -0400 +Subject: ext4: Fix estimate of # of blocks needed to write indirect-mapped files + +commit d330a5befb88875a9b3d2db62f9b74dadf660b13 upstream (as of v2.6.34-rc3) + +http://bugzilla.kernel.org/show_bug.cgi?id=15420 + +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1060,7 +1060,7 @@ static int ext4_indirect_calc_metadata_a + sector_t lblock) + { + struct ext4_inode_info *ei = EXT4_I(inode); +- int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; ++ sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); + int blk_bits; + + if (lblock < EXT4_NDIR_BLOCKS) +@@ -1075,7 +1075,7 @@ static int ext4_indirect_calc_metadata_a + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; +- blk_bits = roundup_pow_of_two(lblock + 1); ++ blk_bits = order_base_2(lblock); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; + } + diff --git a/queue-2.6.32/0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch b/queue-2.6.32/0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch new file mode 100644 index 00000000000..2e468e5f027 --- /dev/null +++ b/queue-2.6.32/0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch @@ -0,0 +1,38 @@ +From 0177767f12e4ebcb387fc3c7e5945611ce0dd6f1 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Sun, 30 May 2010 22:49:39 -0400 +Subject: ext4: Fixed inode allocator to correctly track a flex_bg's used_dirs + +commit c4caae25187ff3f5e837c6f04eb1acc2723c72d3 upstream (as of v2.6.34-rc3) + +When used_dirs was introduced for the flex_groups struct, it looks +like the 
accounting was not put into place properly, in some places +manipulating free_inodes rather than used_dirs. + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ialloc.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -268,7 +268,7 @@ void ext4_free_inode(handle_t *handle, s + ext4_group_t f; + + f = ext4_flex_group(sbi, block_group); +- atomic_dec(&sbi->s_flex_groups[f].free_inodes); ++ atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } + + } +@@ -779,7 +779,7 @@ static int ext4_claim_inode(struct super + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + +- atomic_inc(&sbi->s_flex_groups[f].free_inodes); ++ atomic_inc(&sbi->s_flex_groups[f].used_dirs); + } + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); diff --git a/queue-2.6.32/0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch b/queue-2.6.32/0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch new file mode 100644 index 00000000000..71331829a15 --- /dev/null +++ b/queue-2.6.32/0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch @@ -0,0 +1,39 @@ +From 457ad9487d209f3c7bcb6de32aa393f75ba5e22d Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Sun, 30 May 2010 22:49:40 -0400 +Subject: ext4: Fix possible lost inode write in no journal mode + +commit 8b472d739b2ddd8ab7fb278874f696cd95b25a5e upstream (as of v2.6.34-rc6) + +In the no-journal case, ext4_write_inode() will fetch the bh and call +sync_dirty_buffer() on it. However, if the bh has already been +written and the bh reclaimed for some other purpose, AND if the inode +is the only one in the inode table block in use, then +ext4_get_inode_loc() will not read the inode table block from disk, +but as an optimization, fill the block with zero's assuming that its +caller will copy in the on-disk version of the inode. 
This is not +done by ext4_write_inode(), so the contents of the inode can simply +get lost. The fix is to use __ext4_get_inode_loc() with in_mem set to +0, instead of ext4_get_inode_loc(). Long term the API needs to be +fixed so it's obvious why latter is not safe. + +Addresses-Google-Bug: #2526446 + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5260,7 +5260,7 @@ int ext4_write_inode(struct inode *inode + } else { + struct ext4_iloc iloc; + +- err = ext4_get_inode_loc(inode, &iloc); ++ err = __ext4_get_inode_loc(inode, &iloc, 0); + if (err) + return err; + if (wait) diff --git a/queue-2.6.32/0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch b/queue-2.6.32/0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch new file mode 100644 index 00000000000..47c83283988 --- /dev/null +++ b/queue-2.6.32/0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch @@ -0,0 +1,42 @@ +From 62de51f3a99493a99d7f4e3793b5952b40880ea0 Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Sun, 30 May 2010 22:49:41 -0400 +Subject: ext4: Fix buffer head leaks after calls to ext4_get_inode_loc() + +commit fd2dd9fbaf9e498ec63eef298921e36556f7214c upstream (as of v2.6.34-rc6) + +Calls to ext4_get_inode_loc() returns with a reference to a buffer +head in iloc->bh. The callers of this function in ext4_write_inode() +when in no journal mode and in ext4_xattr_fiemap() don't release the +buffer head after using it. 
+ +Addresses-Google-Bug: #2548165 + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 1 + + fs/ext4/inode.c | 1 + + 2 files changed, 2 insertions(+) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3778,6 +3778,7 @@ static int ext4_xattr_fiemap(struct inod + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; ++ brelse(iloc.bh); + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -5273,6 +5273,7 @@ int ext4_write_inode(struct inode *inode + (unsigned long long)iloc.bh->b_blocknr); + err = -EIO; + } ++ brelse(iloc.bh); + } + return err; + } diff --git a/queue-2.6.32/0028-ext4-Issue-the-discard-operation-before-releasing-th.patch b/queue-2.6.32/0028-ext4-Issue-the-discard-operation-before-releasing-th.patch new file mode 100644 index 00000000000..e4c3fc44236 --- /dev/null +++ b/queue-2.6.32/0028-ext4-Issue-the-discard-operation-before-releasing-th.patch @@ -0,0 +1,58 @@ +From 462d9c2b296ce81bf4c6a6899e256ae6188f9a5a Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:49:42 -0400 +Subject: ext4: Issue the discard operation *before* releasing the blocks to be reused + +commit b90f687018e6d6c77d981b09203780f7001407e5 upstream (as of v2.6.34-rc6) + +Otherwise, we can end up having data corruption because the blocks +could get reused and then discarded! 
+ +https://bugzilla.kernel.org/show_bug.cgi?id=15579 + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 24 +++++++++++------------- + 1 file changed, 11 insertions(+), 13 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2537,6 +2537,17 @@ static void release_blocks_on_commit(jou + mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + entry->count, entry->group, entry); + ++ if (test_opt(sb, DISCARD)) { ++ ext4_fsblk_t discard_block; ++ ++ discard_block = entry->start_blk + ++ ext4_group_first_block_no(sb, entry->group); ++ trace_ext4_discard_blocks(sb, ++ (unsigned long long)discard_block, ++ entry->count); ++ sb_issue_discard(sb, discard_block, entry->count); ++ } ++ + err = ext4_mb_load_buddy(sb, entry->group, &e4b); + /* we expect to find existing buddy because it's pinned */ + BUG_ON(err != 0); +@@ -2558,19 +2569,6 @@ static void release_blocks_on_commit(jou + page_cache_release(e4b.bd_bitmap_page); + } + ext4_unlock_group(sb, entry->group); +- if (test_opt(sb, DISCARD)) { +- ext4_fsblk_t discard_block; +- struct ext4_super_block *es = EXT4_SB(sb)->s_es; +- +- discard_block = (ext4_fsblk_t)entry->group * +- EXT4_BLOCKS_PER_GROUP(sb) +- + entry->start_blk +- + le32_to_cpu(es->s_first_data_block); +- trace_ext4_discard_blocks(sb, +- (unsigned long long)discard_block, +- entry->count); +- sb_issue_discard(sb, discard_block, entry->count); +- } + kmem_cache_free(ext4_free_ext_cachep, entry); + ext4_mb_release_desc(&e4b); + } diff --git a/queue-2.6.32/0029-ext4-check-missed-return-value-in-ext4_sync_file.patch b/queue-2.6.32/0029-ext4-check-missed-return-value-in-ext4_sync_file.patch new file mode 100644 index 00000000000..c691f107d58 --- /dev/null +++ b/queue-2.6.32/0029-ext4-check-missed-return-value-in-ext4_sync_file.patch @@ -0,0 +1,25 @@ +From 6aac59ef585709fa8e03cf86dc741954b3af47c7 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:43 -0400 +Subject: ext4: check 
missed return value in ext4_sync_file() + +commit 0671e704658b9f26f85e78d51176daa861f955c7 upstream (as of v2.6.34-git13) + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/fsync.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -101,7 +101,7 @@ int ext4_sync_file(struct file *file, st + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); +- jbd2_log_wait_commit(journal, commit_tid); ++ ret = jbd2_log_wait_commit(journal, commit_tid); + } else if (journal->j_flags & JBD2_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); + return ret; diff --git a/queue-2.6.32/0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch b/queue-2.6.32/0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch new file mode 100644 index 00000000000..d03fe27caea --- /dev/null +++ b/queue-2.6.32/0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch @@ -0,0 +1,62 @@ +From bc65559adfab46dcbcab65d1830490c5043983bf Mon Sep 17 00:00:00 2001 +From: Jing Zhang +Date: Sun, 30 May 2010 22:49:44 -0400 +Subject: ext4: fix memory leaks in error path handling of ext4_ext_zeroout() + +commit b720303df7352d4a7a1f61e467e0a124913c0d41 upstream (as of v2.6.34-git13) + +When EIO occurs after bio is submitted, there is no memory free +operation for bio, which results in memory leakage. And there is also +no check against bio_alloc() for bio. + +Acked-by: Dave Kleikamp +Signed-off-by: Jing Zhang +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2446,7 +2446,7 @@ static void bi_complete(struct bio *bio, + /* FIXME!! 
we need to try to merge to left or right after zero-out */ + static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) + { +- int ret = -EIO; ++ int ret; + struct bio *bio; + int blkbits, blocksize; + sector_t ee_pblock; +@@ -2470,6 +2470,9 @@ static int ext4_ext_zeroout(struct inode + len = ee_len; + + bio = bio_alloc(GFP_NOIO, len); ++ if (!bio) ++ return -ENOMEM; ++ + bio->bi_sector = ee_pblock; + bio->bi_bdev = inode->i_sb->s_bdev; + +@@ -2497,17 +2500,15 @@ static int ext4_ext_zeroout(struct inode + submit_bio(WRITE, bio); + wait_for_completion(&event); + +- if (test_bit(BIO_UPTODATE, &bio->bi_flags)) +- ret = 0; +- else { +- ret = -EIO; +- break; ++ if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { ++ bio_put(bio); ++ return -EIO; + } + bio_put(bio); + ee_len -= done; + ee_pblock += done << (blkbits - 9); + } +- return ret; ++ return 0; + } + + #define EXT4_EXT_ZERO_LEN 7 diff --git a/queue-2.6.32/0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch b/queue-2.6.32/0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch new file mode 100644 index 00000000000..ae93e97922c --- /dev/null +++ b/queue-2.6.32/0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch @@ -0,0 +1,32 @@ +From dc93068aadac2019c504112d2761773e64e7ba72 Mon Sep 17 00:00:00 2001 +From: Jing Zhang +Date: Sun, 30 May 2010 22:49:45 -0400 +Subject: ext4: Remove unnecessary call to ext4_get_group_desc() in mballoc + +commit 62e823a2cba18509ee826d775270e8ef9071b5bc upstream (as of v2.6.34-git13) + +Signed-off-by: Jing Zhang +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 2 -- + 1 file changed, 2 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2027,7 +2027,6 @@ repeat: + + for (i = 0; i < ngroups; group++, i++) { + struct ext4_group_info *grp; +- struct ext4_group_desc *desc; + + if (group == ngroups) + group = 0; +@@ -2050,7 +2049,6 @@ repeat: + } + + ac->ac_groups_scanned++; +- desc = 
ext4_get_group_desc(sb, group, NULL); + if (cr == 0) + ext4_mb_simple_scan_group(ac, &e4b); + else if (cr == 1 && diff --git a/queue-2.6.32/0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch b/queue-2.6.32/0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch new file mode 100644 index 00000000000..b26f5f60df7 --- /dev/null +++ b/queue-2.6.32/0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch @@ -0,0 +1,127 @@ +From 5fc0d2b4f06dfd2a941e23171a5a4a155383c47a Mon Sep 17 00:00:00 2001 +From: Jing Zhang +Date: Sun, 30 May 2010 22:49:46 -0400 +Subject: ext4: rename ext4_mb_release_desc() to ext4_mb_unload_buddy() + +commit e39e07fdfd98be8650385f12a7b81d6adc547510 upstream (as of v2.6.34-git13) + +This function cleans up after ext4_mb_load_buddy(), so the renaming +makes the code clearer. + +Signed-off-by: Jing Zhang +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1150,7 +1150,7 @@ err: + return ret; + } + +-static void ext4_mb_release_desc(struct ext4_buddy *e4b) ++static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) + { + if (e4b->bd_bitmap_page) + page_cache_release(e4b->bd_bitmap_page); +@@ -1618,7 +1618,7 @@ int ext4_mb_try_best_found(struct ext4_a + } + + ext4_unlock_group(ac->ac_sb, group); +- ext4_mb_release_desc(e4b); ++ ext4_mb_unload_buddy(e4b); + + return 0; + } +@@ -1674,7 +1674,7 @@ int ext4_mb_find_by_goal(struct ext4_all + ext4_mb_use_best_found(ac, e4b); + } + ext4_unlock_group(ac->ac_sb, group); +- ext4_mb_release_desc(e4b); ++ ext4_mb_unload_buddy(e4b); + + return 0; + } +@@ -2044,7 +2044,7 @@ repeat: + if (!ext4_mb_good_group(ac, group, cr)) { + /* someone did allocation from this group */ + ext4_unlock_group(sb, group); +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + continue; + } + +@@ -2058,7 +2058,7 @@ repeat: + 
ext4_mb_complex_scan_group(ac, &e4b); + + ext4_unlock_group(sb, group); +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + + if (ac->ac_status != AC_STATUS_CONTINUE) + break; +@@ -2148,7 +2148,7 @@ static int ext4_mb_seq_groups_show(struc + ext4_lock_group(sb, group); + memcpy(&sg, ext4_get_group_info(sb, group), i); + ext4_unlock_group(sb, group); +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, + sg.info.bb_fragments, sg.info.bb_first_free); +@@ -2568,7 +2568,7 @@ static void release_blocks_on_commit(jou + } + ext4_unlock_group(sb, entry->group); + kmem_cache_free(ext4_free_ext_cachep, entry); +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + } + + mb_debug(1, "freed %u blocks in %u structures\n", count, count2); +@@ -3705,7 +3705,7 @@ out: + ext4_unlock_group(sb, group); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + return free; + } +@@ -3809,7 +3809,7 @@ repeat: + if (bitmap_bh == NULL) { + ext4_error(sb, __func__, "Error in reading block " + "bitmap for %u", group); +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + continue; + } + +@@ -3818,7 +3818,7 @@ repeat: + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_unlock_group(sb, group); + +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + put_bh(bitmap_bh); + + list_del(&pa->u.pa_tmp_list); +@@ -4082,7 +4082,7 @@ ext4_mb_discard_lg_preallocations(struct + ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_unlock_group(sb, group); + +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + list_del(&pa->u.pa_tmp_list); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } +@@ -4584,7 +4584,7 @@ do_more: + atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); + } + +- ext4_mb_release_desc(&e4b); ++ ext4_mb_unload_buddy(&e4b); + + *freed += count; + diff --git 
a/queue-2.6.32/0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch b/queue-2.6.32/0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch new file mode 100644 index 00000000000..144ceaece84 --- /dev/null +++ b/queue-2.6.32/0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch @@ -0,0 +1,42 @@ +From 9a0bd6ee7ccc0cfdc614dbc6a4708d596ec53f82 Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Sun, 30 May 2010 22:49:47 -0400 +Subject: ext4: allow defrag (EXT4_IOC_MOVE_EXT) in 32bit compat mode + +commit b684b2ee9409f2890a8b3aea98525bbe5f84e276 upstream (as of v2.6.34-git13) + +I have an x86_64 kernel with i386 userspace. e4defrag fails on the +EXT4_IOC_MOVE_EXT ioctl because it is not wired up for the compat +case. It seems that struct move_extent is compat safe, only types +with fixed widths are used: +{ + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +Let's just wire up EXT4_IOC_MOVE_EXT for the compat case. 
+ +Signed-off-by: Christian Borntraeger +Signed-off-by: "Theodore Ts'o" +Reviewed-by: Eric Sandeen +CC: Akira Fujita +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ioctl.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -375,6 +375,8 @@ long ext4_compat_ioctl(struct file *file + break; + case EXT4_IOC_GROUP_ADD: + break; ++ case EXT4_IOC_MOVE_EXT: ++ break; + default: + return -ENOIOCTLCMD; + } diff --git a/queue-2.6.32/0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch b/queue-2.6.32/0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch new file mode 100644 index 00000000000..46217fa31f2 --- /dev/null +++ b/queue-2.6.32/0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch @@ -0,0 +1,28 @@ +From 93984006ca6af7d067409fd6db2bedd999af2b0d Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:48 -0400 +Subject: ext4: fix quota accounting in case of fallocate + +commit 35121c9860316d7799cea0fbc359a9186e7c2747 upstream (as of v2.6.34-git13) + +allocated_meta_data is already included in 'used' variable. 
+ +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -1149,7 +1149,8 @@ void ext4_da_update_reserve_space(struct + */ + if (allocated_meta_blocks) + vfs_dq_claim_block(inode, allocated_meta_blocks); +- vfs_dq_release_reservation_block(inode, mdb_free + used); ++ vfs_dq_release_reservation_block(inode, mdb_free + used - ++ allocated_meta_blocks); + } + + /* diff --git a/queue-2.6.32/0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch b/queue-2.6.32/0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch new file mode 100644 index 00000000000..8d30580f2c3 --- /dev/null +++ b/queue-2.6.32/0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch @@ -0,0 +1,46 @@ +From 9e92f0bbe85a6ceead4b1215861f1a30bfe1d9dc Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Sun, 30 May 2010 22:49:49 -0400 +Subject: ext4: check s_log_groups_per_flex in online resize code + +commit 42007efd569f1cf3bfb9a61da60ef6c2179508ca upstream (as of v2.6.34-git13) + +If groups_per_flex < 2, sbi->s_flex_groups[] doesn't get filled out, +and every other access to this first tests s_log_groups_per_flex; +same thing needs to happen in resize or we'll wander off into +a null pointer when doing an online resize of the file system. 
+ +Thanks to Christoph Biedl, who came up with the trivial testcase: + +# truncate --size 128M fsfile +# mkfs.ext3 -F fsfile +# tune2fs -O extents,uninit_bg,dir_index,flex_bg,huge_file,dir_nlink,extra_isize fsfile +# e2fsck -yDf -C0 fsfile +# truncate --size 132M fsfile +# losetup /dev/loop0 fsfile +# mount /dev/loop0 mnt +# resize2fs -p /dev/loop0 + + https://bugzilla.kernel.org/show_bug.cgi?id=13549 + +Reported-by: Alessandro Polverini +Test-case-by: Christoph Biedl +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/resize.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -930,7 +930,8 @@ int ext4_group_add(struct super_block *s + percpu_counter_add(&sbi->s_freeinodes_counter, + EXT4_INODES_PER_GROUP(sb)); + +- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { ++ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) && ++ sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, input->group); + atomic_add(input->free_blocks_count, diff --git a/queue-2.6.32/0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch b/queue-2.6.32/0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch new file mode 100644 index 00000000000..a224c526195 --- /dev/null +++ b/queue-2.6.32/0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch @@ -0,0 +1,86 @@ +From 168b7c0d3438662c33488f73a27036f14c176efc Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Sun, 30 May 2010 22:49:50 -0400 +Subject: ext4: don't return to userspace after freezing the fs with a mutex held + +commit 6b0310fbf087ad6e9e3b8392adca97cd77184084 upstream (as of v2.6.34-git13) + +ext4_freeze() used jbd2_journal_lock_updates() which takes +the j_barrier mutex, and then returns to userspace. 
The +kernel does not like this: + +================================================ +[ BUG: lock held when returning to user space! ] +------------------------------------------------ +lvcreate/1075 is leaving the kernel with locks still held! +1 lock held by lvcreate/1075: + #0: (&journal->j_barrier){+.+...}, at: [] +jbd2_journal_lock_updates+0xe1/0xf0 + +Use vfs_check_frozen() added to ext4_journal_start_sb() and +ext4_force_commit() instead. + +Addresses-Red-Hat-Bugzilla: #568503 + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -227,6 +227,7 @@ handle_t *ext4_journal_start_sb(struct s + if (sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + ++ vfs_check_frozen(sb, SB_FREEZE_WRITE); + /* Special case here: if the journal has aborted behind our + * backs (eg. EIO in the commit thread), then we still need to + * take the FS itself readonly cleanly. */ +@@ -3391,8 +3392,10 @@ int ext4_force_commit(struct super_block + return 0; + + journal = EXT4_SB(sb)->s_journal; +- if (journal) ++ if (journal) { ++ vfs_check_frozen(sb, SB_FREEZE_WRITE); + ret = ext4_journal_force_commit(journal); ++ } + + return ret; + } +@@ -3441,18 +3444,16 @@ static int ext4_freeze(struct super_bloc + * the journal. + */ + error = jbd2_journal_flush(journal); +- if (error < 0) { +- out: +- jbd2_journal_unlock_updates(journal); +- return error; +- } ++ if (error < 0) ++ goto out; + + /* Journal blocked and flushed, clear needs_recovery flag. 
*/ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); +- if (error) +- goto out; +- return 0; ++out: ++ /* we rely on s_frozen to stop further updates */ ++ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); ++ return error; + } + + /* +@@ -3469,7 +3470,6 @@ static int ext4_unfreeze(struct super_bl + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); +- jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + return 0; + } + diff --git a/queue-2.6.32/0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch b/queue-2.6.32/0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch new file mode 100644 index 00000000000..d942d74f6bf --- /dev/null +++ b/queue-2.6.32/0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch @@ -0,0 +1,44 @@ +From 0778bf26394249a97740013f92198b5272703e8b Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Sun, 30 May 2010 22:49:51 -0400 +Subject: ext4: stop issuing discards if not supported by device + +commit a30eec2a8650a77f754e84b2e15f062fe652baa7 upstream (as of v2.6.34-git13) + +Turn off issuance of discard requests if the device does +not support it - similar to the action we take for barriers. +This will save a little computation time if a non-discardable +device is mounted with -o discard, and also makes it obvious +that it's not doing what was asked at mount time ... 
+ +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/mballoc.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2536,6 +2536,7 @@ static void release_blocks_on_commit(jou + entry->count, entry->group, entry); + + if (test_opt(sb, DISCARD)) { ++ int ret; + ext4_fsblk_t discard_block; + + discard_block = entry->start_blk + +@@ -2543,7 +2544,12 @@ static void release_blocks_on_commit(jou + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); +- sb_issue_discard(sb, discard_block, entry->count); ++ ret = sb_issue_discard(sb, discard_block, entry->count); ++ if (ret == EOPNOTSUPP) { ++ ext4_warning(sb, __func__, ++ "discard not supported, disabling"); ++ clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); ++ } + } + + err = ext4_mb_load_buddy(sb, entry->group, &e4b); diff --git a/queue-2.6.32/0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch b/queue-2.6.32/0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch new file mode 100644 index 00000000000..95ea1c29179 --- /dev/null +++ b/queue-2.6.32/0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch @@ -0,0 +1,62 @@ +From 2f4283aff3e5415fa36cbf81aa2a6247bfbb0527 Mon Sep 17 00:00:00 2001 +From: Eric Sandeen +Date: Sun, 30 May 2010 22:49:52 -0400 +Subject: ext4: don't scan/accumulate more pages than mballoc will allocate + +commit c445e3e0a5c2804524dec6e55f66d63f6bc5bc3e upstream (as of v2.6.34-git13) + +There was a bug reported on RHEL5 that a 10G dd on a 12G box +had a very, very slow sync after that. + +At issue was the loop in write_cache_pages scanning all the way +to the end of the 10G file, even though the subsequent call +to mpage_da_submit_io would only actually write a smallish amt; then +we went back to the write_cache_pages loop ... 
wasting tons of time +in calling __mpage_da_writepage for thousands of pages we would +just revisit (many times) later. + +Upstream it's not such a big issue for sys_sync because we get +to the loop with a much smaller nr_to_write, which limits the loop. + +However, talking with Aneesh he realized that fsync upstream still +gets here with a very large nr_to_write and we face the same problem. + +This patch makes mpage_add_bh_to_extent stop the loop after we've +accumulated 2048 pages, by setting mpd->io_done = 1; which ultimately +causes the write_cache_pages loop to break. + +Repeating the test with a dirty_ratio of 80 (to leave something for +fsync to do), I don't see huge IO performance gains, but the reduction +in cpu usage is striking: 80% usage with stock, and 2% with the +below patch. Instrumenting the loop in write_cache_pages clearly +shows that we are wasting time here. + +Eventually we need to change mpage_da_map_pages() to also submit its I/O +to the block layer, subsuming mpage_da_submit_io(), and then change it +to call ext4_get_blocks() multiple times. + +Signed-off-by: Eric Sandeen +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2361,6 +2361,15 @@ static void mpage_add_bh_to_extent(struc + sector_t next; + int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; + ++ /* ++ * XXX Don't go larger than mballoc is willing to allocate ++ * This is a stopgap solution.
We eventually need to fold ++ * mpage_da_submit_io() into this function and then call ++ * ext4_get_blocks() multiple times in a loop ++ */ ++ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) ++ goto flush_it; ++ + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { diff --git a/queue-2.6.32/0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch b/queue-2.6.32/0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch new file mode 100644 index 00000000000..961282796b5 --- /dev/null +++ b/queue-2.6.32/0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch @@ -0,0 +1,199 @@ +From 3f9db529f4db9500a2bc9d296258a0dd8f9ac03e Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:53 -0400 +Subject: ext4: Do not zero out uninitialized extents beyond i_size + +commit 21ca087a3891efab4d45488db8febee474d26c68 upstream (as of v2.6.34-git13) + +The extents code will sometimes zero out blocks and mark them as +initialized instead of splitting an extent into several smaller ones. +This optimization however, causes problems if the extent is beyond +i_size because fsck will complain if there are uninitialized blocks +after i_size as this can not be distinguished from an inode that has +an incorrect i_size field. 
+ +https://bugzilla.kernel.org/show_bug.cgi?id=15742 + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 67 +++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 51 insertions(+), 16 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -2533,11 +2533,21 @@ static int ext4_ext_convert_to_initializ + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; +- ext4_lblk_t ee_block; ++ ext4_lblk_t ee_block, eof_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; + int ret = 0; ++ int may_zeroout; ++ ++ ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" ++ "block %llu, max_blocks %u\n", inode->i_ino, ++ (unsigned long long)iblock, max_blocks); ++ ++ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> ++ inode->i_sb->s_blocksize_bits; ++ if (eof_block < iblock + max_blocks) ++ eof_block = iblock + max_blocks; + + depth = ext_depth(inode); + eh = path[depth].p_hdr; +@@ -2546,16 +2556,23 @@ static int ext4_ext_convert_to_initializ + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); ++ + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + ++ /* ++ * It is safe to convert extent to initialized via explicit ++ * zeroout only if extent is fully insde i_size or new_size. 
++ */ ++ may_zeroout = ee_block + ee_len <= eof_block; ++ + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + goto out; + /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ +- if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { ++ if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; +@@ -2586,7 +2603,7 @@ static int ext4_ext_convert_to_initializ + if (allocated > max_blocks) { + unsigned int newdepth; + /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ +- if (allocated <= EXT4_EXT_ZERO_LEN) { ++ if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) { + /* + * iblock == ee_block is handled by the zerouout + * at the beginning. +@@ -2662,7 +2679,7 @@ static int ext4_ext_convert_to_initializ + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); +- if (err == -ENOSPC) { ++ if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; +@@ -2686,8 +2703,10 @@ static int ext4_ext_convert_to_initializ + * update the extent length after successful insert of the + * split extent + */ +- orig_ex.ee_len = cpu_to_le16(ee_len - +- ext4_ext_get_actual_len(ex3)); ++ ee_len -= ext4_ext_get_actual_len(ex3); ++ orig_ex.ee_len = cpu_to_le16(ee_len); ++ may_zeroout = ee_block + ee_len <= eof_block; ++ + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); +@@ -2711,7 +2730,7 @@ static int ext4_ext_convert_to_initializ + * otherwise give the extent a chance to merge to left + */ + if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && +- iblock != ee_block) { ++ iblock != ee_block && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; +@@ -2780,7 +2799,7 @@ static int ext4_ext_convert_to_initializ + goto out; + insert: + err = 
ext4_ext_insert_extent(handle, inode, path, &newex, 0); +- if (err == -ENOSPC) { ++ if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; +@@ -2840,14 +2859,21 @@ static int ext4_split_unwritten_extents( + struct ext4_extent *ex2 = NULL; + struct ext4_extent *ex3 = NULL; + struct ext4_extent_header *eh; +- ext4_lblk_t ee_block; ++ ext4_lblk_t ee_block, eof_block; + unsigned int allocated, ee_len, depth; + ext4_fsblk_t newblock; + int err = 0; ++ int may_zeroout; ++ ++ ext_debug("ext4_split_unwritten_extents: inode %lu, logical" ++ "block %llu, max_blocks %u\n", inode->i_ino, ++ (unsigned long long)iblock, max_blocks); ++ ++ eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >> ++ inode->i_sb->s_blocksize_bits; ++ if (eof_block < iblock + max_blocks) ++ eof_block = iblock + max_blocks; + +- ext_debug("ext4_split_unwritten_extents: inode %lu," +- "iblock %llu, max_blocks %u\n", inode->i_ino, +- (unsigned long long)iblock, max_blocks); + depth = ext_depth(inode); + eh = path[depth].p_hdr; + ex = path[depth].p_ext; +@@ -2855,12 +2881,19 @@ static int ext4_split_unwritten_extents( + ee_len = ext4_ext_get_actual_len(ex); + allocated = ee_len - (iblock - ee_block); + newblock = iblock - ee_block + ext_pblock(ex); ++ + ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); + + /* ++ * It is safe to convert extent to initialized via explicit ++ * zeroout only if extent is fully insde i_size or new_size. ++ */ ++ may_zeroout = ee_block + ee_len <= eof_block; ++ ++ /* + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. 
+@@ -2894,7 +2927,7 @@ static int ext4_split_unwritten_extents( + ex3->ee_len = cpu_to_le16(allocated - max_blocks); + ext4_ext_mark_uninitialized(ex3); + err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); +- if (err == -ENOSPC) { ++ if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; +@@ -2918,8 +2951,10 @@ static int ext4_split_unwritten_extents( + * update the extent length after successful insert of the + * split extent + */ +- orig_ex.ee_len = cpu_to_le16(ee_len - +- ext4_ext_get_actual_len(ex3)); ++ ee_len -= ext4_ext_get_actual_len(ex3); ++ orig_ex.ee_len = cpu_to_le16(ee_len); ++ may_zeroout = ee_block + ee_len <= eof_block; ++ + depth = newdepth; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, iblock, path); +@@ -2965,7 +3000,7 @@ static int ext4_split_unwritten_extents( + goto out; + insert: + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); +- if (err == -ENOSPC) { ++ if (err == -ENOSPC && may_zeroout) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; diff --git a/queue-2.6.32/0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch b/queue-2.6.32/0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch new file mode 100644 index 00000000000..4e504526125 --- /dev/null +++ b/queue-2.6.32/0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch @@ -0,0 +1,124 @@ +From ae42cce7e825bdc82a8e9c30a87c342d1e364e57 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:54 -0400 +Subject: ext4: clean up inode bitmaps manipulation in ext4_free_inode + +commit d17413c08cd2b1dd2bf2cfdbb0f7b736b2b2b15c upstream (as of v2.6.34-git13) + +- Reorganize locking scheme to batch two atomic operations into one.
+ This also allows us to state that a healthy group must obey the following rule + ext4_free_inodes_count(sb, gdp) == ext4_count_free(inode_bitmap, NUM); +- Fix possible undefined pointer dereference. +- Even if group descriptor stats aren't accessible we have to update + inode bitmaps. +- Move non-group members update out of group_lock. + +Note: this commit has been observed to fix fs corruption problems +under heavy fs load + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ialloc.c | 85 +++++++++++++++++++++++++------------------------------ + 1 file changed, 39 insertions(+), 46 deletions(-) + +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -244,57 +244,50 @@ void ext4_free_inode(handle_t *handle, s + if (fatal) + goto error_return; + +- /* Ok, now we can actually update the inode bitmaps.. */ +- cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), +- bit, bitmap_bh->b_data); +- if (!cleared) +- ext4_error(sb, "ext4_free_inode", +- "bit already cleared for inode %lu", ino); +- else { +- gdp = ext4_get_group_desc(sb, block_group, &bh2); +- ++ fatal = -ESRCH; ++ gdp = ext4_get_group_desc(sb, block_group, &bh2); ++ if (gdp) { + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext4_journal_get_write_access(handle, bh2); +- if (fatal) goto error_return; +- +- if (gdp) { +- ext4_lock_group(sb, block_group); +- count = ext4_free_inodes_count(sb, gdp) + 1; +- ext4_free_inodes_set(sb, gdp, count); +- if (is_directory) { +- count = ext4_used_dirs_count(sb, gdp) - 1; +- ext4_used_dirs_set(sb, gdp, count); +- if (sbi->s_log_groups_per_flex) { +- ext4_group_t f; +- +- f = ext4_flex_group(sbi, block_group); +- atomic_dec(&sbi->s_flex_groups[f].used_dirs); +- } +- +- } +- gdp->bg_checksum = ext4_group_desc_csum(sbi, +- block_group, gdp); +- ext4_unlock_group(sb, block_group); +- percpu_counter_inc(&sbi->s_freeinodes_counter); +- if (is_directory) +- percpu_counter_dec(&sbi->s_dirs_counter); ++ } ++
ext4_lock_group(sb, block_group); ++ cleared = ext4_clear_bit(bit, bitmap_bh->b_data); ++ if (fatal || !cleared) { ++ ext4_unlock_group(sb, block_group); ++ goto out; ++ } + +- if (sbi->s_log_groups_per_flex) { +- ext4_group_t f; ++ count = ext4_free_inodes_count(sb, gdp) + 1; ++ ext4_free_inodes_set(sb, gdp, count); ++ if (is_directory) { ++ count = ext4_used_dirs_count(sb, gdp) - 1; ++ ext4_used_dirs_set(sb, gdp, count); ++ percpu_counter_dec(&sbi->s_dirs_counter); ++ } ++ gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); ++ ext4_unlock_group(sb, block_group); + +- f = ext4_flex_group(sbi, block_group); +- atomic_inc(&sbi->s_flex_groups[f].free_inodes); +- } +- } +- BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); +- err = ext4_handle_dirty_metadata(handle, NULL, bh2); +- if (!fatal) fatal = err; ++ percpu_counter_inc(&sbi->s_freeinodes_counter); ++ if (sbi->s_log_groups_per_flex) { ++ ext4_group_t f = ext4_flex_group(sbi, block_group); ++ ++ atomic_inc(&sbi->s_flex_groups[f].free_inodes); ++ if (is_directory) ++ atomic_dec(&sbi->s_flex_groups[f].used_dirs); + } +- BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); +- err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); +- if (!fatal) +- fatal = err; +- sb->s_dirt = 1; ++ BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); ++ fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); ++out: ++ if (cleared) { ++ BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); ++ err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); ++ if (!fatal) ++ fatal = err; ++ sb->s_dirt = 1; ++ } else ++ ext4_error(sb, "ext4_free_inode", ++ "bit already cleared for inode %lu", ino); ++ + error_return: + brelse(bitmap_bh); + ext4_std_error(sb, fatal); diff --git a/queue-2.6.32/0041-ext4-init-statistics-after-journal-recovery.patch b/queue-2.6.32/0041-ext4-init-statistics-after-journal-recovery.patch new file mode 100644 index 00000000000..72f7f3129c2 --- /dev/null +++ 
b/queue-2.6.32/0041-ext4-init-statistics-after-journal-recovery.patch @@ -0,0 +1,93 @@ +From 73337c4a1e35c3dedceb9e2d3af84da8614e6a45 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:49:55 -0400 +Subject: ext4: init statistics after journal recovery + +commit 84061e07c5fbbbf9dc8aef8fb750fc3a2dfc31f3 upstream (as of v2.6.34-git13) + +Currently block/inode/dir counters are initialized before the journal is +recovered. In fact after journal recovery this info will probably +change. And freeblocks is critical for correct delalloc mode +accounting. + +https://bugzilla.kernel.org/show_bug.cgi?id=15768 + +Signed-off-by: Dmitry Monakhov +Acked-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 41 ++++++++++++++++++----------------------- + 1 file changed, 18 insertions(+), 23 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2695,24 +2695,6 @@ static int ext4_fill_super(struct super_ + get_random_bytes(&sbi->s_next_generation, sizeof(u32)); + spin_lock_init(&sbi->s_next_gen_lock); + +- err = percpu_counter_init(&sbi->s_freeblocks_counter, +- ext4_count_free_blocks(sb)); +- if (!err) { +- err = percpu_counter_init(&sbi->s_freeinodes_counter, +- ext4_count_free_inodes(sb)); +- } +- if (!err) { +- err = percpu_counter_init(&sbi->s_dirs_counter, +- ext4_count_dirs(sb)); +- } +- if (!err) { +- err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); +- } +- if (err) { +- ext4_msg(sb, KERN_ERR, "insufficient memory"); +- goto failed_mount3; +- } +- + sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; + +@@ -2832,7 +2814,20 @@ static int ext4_fill_super(struct super_ + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + + no_journal: +- ++ err = percpu_counter_init(&sbi->s_freeblocks_counter, ++ ext4_count_free_blocks(sb)); ++ if (!err) ++ err = percpu_counter_init(&sbi->s_freeinodes_counter, ++ ext4_count_free_inodes(sb)); ++ if (!err) ++ err =
percpu_counter_init(&sbi->s_dirs_counter, ++ ext4_count_dirs(sb)); ++ if (!err) ++ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); ++ if (err) { ++ ext4_msg(sb, KERN_ERR, "insufficient memory"); ++ goto failed_mount_wq; ++ } + if (test_opt(sb, NOBH)) { + if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { + ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " +@@ -2965,6 +2960,10 @@ failed_mount_wq: + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } ++ percpu_counter_destroy(&sbi->s_freeblocks_counter); ++ percpu_counter_destroy(&sbi->s_freeinodes_counter); ++ percpu_counter_destroy(&sbi->s_dirs_counter); ++ percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + failed_mount3: + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) +@@ -2972,10 +2971,6 @@ failed_mount3: + else + kfree(sbi->s_flex_groups); + } +- percpu_counter_destroy(&sbi->s_freeblocks_counter); +- percpu_counter_destroy(&sbi->s_freeinodes_counter); +- percpu_counter_destroy(&sbi->s_dirs_counter); +- percpu_counter_destroy(&sbi->s_dirtyblocks_counter); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); diff --git a/queue-2.6.32/0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch b/queue-2.6.32/0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch new file mode 100644 index 00000000000..4a4cd7b6d02 --- /dev/null +++ b/queue-2.6.32/0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch @@ -0,0 +1,57 @@ +From 2db9e1a9cc528228b60ece755187b60331db966d Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Sun, 30 May 2010 22:49:56 -0400 +Subject: ext4: Remove extraneous newlines in ext4_msg() calls + +commit fbe845ddf368f77f86aa7500f8fd2690f54c66a8 upstream (as of v2.6.34-git13) + +Addresses-Google-Bug: #2562325 + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/inode.c | 6 +++--- + fs/ext4/super.c | 2 +- + 2 files changed, 4 
insertions(+), 4 deletions(-) + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2294,7 +2294,7 @@ static int mpage_da_map_blocks(struct mp + ext4_msg(mpd->inode->i_sb, KERN_CRIT, + "delayed block allocation failed for inode %lu at " + "logical offset %llu with max blocks %zd with " +- "error %d\n", mpd->inode->i_ino, ++ "error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_CRIT "This should not happen!! " +@@ -2956,7 +2956,7 @@ retry: + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " +- "%ld pages, ino %lu; err %d\n", __func__, ++ "%ld pages, ino %lu; err %d", __func__, + wbc->nr_to_write, inode->i_ino, ret); + goto out_writepages; + } +@@ -3031,7 +3031,7 @@ retry: + if (pages_skipped != wbc->pages_skipped) + ext4_msg(inode->i_sb, KERN_CRIT, + "This should not happen leaving %s " +- "with nr_to_write = %ld ret = %d\n", ++ "with nr_to_write = %ld ret = %d", + __func__, wbc->nr_to_write, ret); + + /* Update index */ +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2902,7 +2902,7 @@ no_journal: + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " +- "zone (%d)\n", err); ++ "zone (%d)", err); + goto failed_mount4; + } + diff --git a/queue-2.6.32/0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch b/queue-2.6.32/0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch new file mode 100644 index 00000000000..2d25790952a --- /dev/null +++ b/queue-2.6.32/0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch @@ -0,0 +1,32 @@ +From 1050094d53941e319e9d50d4171f060dddd5dc87 Mon Sep 17 00:00:00 2001 +From: Nikanth Karthikesan +Date: Sun, 30 May 2010 22:49:57 -0400 +Subject: ext4: Prevent creation of files larger than RLIMIT_FSIZE using fallocate + +commit 6d19c42b7cf81c39632b6d4dbc514e8449bcd346 upstream (as of v2.6.34-git13) + +Currently using posix_fallocate one can bypass 
an RLIMIT_FSIZE limit +and create a file larger than the limit. Add a check for that. + +Signed-off-by: Nikanth Karthikesan +Signed-off-by: Amit Arora +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3607,6 +3607,11 @@ long ext4_fallocate(struct inode *inode, + */ + credits = ext4_chunk_trans_blocks(inode, max_blocks); + mutex_lock(&inode->i_mutex); ++ ret = inode_newsize_ok(inode, (len + offset)); ++ if (ret) { ++ mutex_unlock(&inode->i_mutex); ++ return ret; ++ } + retry: + while (ret >= 0 && ret < max_blocks) { + block = block + ret; diff --git a/queue-2.6.32/0044-ext4-check-for-a-good-block-group-before-loading-bud.patch b/queue-2.6.32/0044-ext4-check-for-a-good-block-group-before-loading-bud.patch new file mode 100644 index 00000000000..69f70905172 --- /dev/null +++ b/queue-2.6.32/0044-ext4-check-for-a-good-block-group-before-loading-bud.patch @@ -0,0 +1,214 @@ +From 7d4df70b86aef3e1c2b92bede60009527b3470fd Mon Sep 17 00:00:00 2001 +From: Curt Wohlgemuth +Date: Sun, 30 May 2010 22:49:58 -0400 +Subject: ext4: check for a good block group before loading buddy pages + +commit 8a57d9d61a6e361c7bb159dda797672c1df1a691 upstream (as of v2.6.34-git13) + +This adds a new field in ext4_group_info to cache the largest available +block range in a block group; and don't load the buddy pages until *after* +we've done a sanity check on the block group. + +With large allocation requests (e.g., fallocate(), 8MiB) and relatively full +partitions, it's easy to have no block groups with a block extent large +enough to satisfy the input request length. This currently causes the loop +during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages +for EVERY block group. That can be a lot of pages. 
The patch below allows +us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we +have check again after we lock the block group). + +Addresses-Google-Bug: #2578108 +Addresses-Google-Bug: #2704453 + +Signed-off-by: Curt Wohlgemuth +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 1 + fs/ext4/mballoc.c | 70 +++++++++++++++++++++++++++++++++++++++++++----------- + 2 files changed, 58 insertions(+), 13 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1657,6 +1657,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_first_free; /* first free block */ + ext4_grpblk_t bb_free; /* total free blocks */ + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ++ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; + #ifdef DOUBLE_CHECK + void *bb_bitmap; +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(str + } + } + ++/* ++ * Cache the order of the largest free extent we have available in this block ++ * group. ++ */ ++static void ++mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) ++{ ++ int i; ++ int bits; ++ ++ grp->bb_largest_free_order = -1; /* uninit */ ++ ++ bits = sb->s_blocksize_bits + 1; ++ for (i = bits; i >= 0; i--) { ++ if (grp->bb_counters[i] > 0) { ++ grp->bb_largest_free_order = i; ++ break; ++ } ++ } ++} ++ + static noinline_for_stack + void ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) +@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super + */ + grp->bb_free = free; + } ++ mb_set_largest_free_order(sb, grp); + + clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + +@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super + * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 
+ * So it can have information regarding groups_per_page which + * is blocks_per_page/2 ++ * ++ * Locking note: This routine takes the block group lock of all groups ++ * for this page; do not hold this lock when calling this routine! + */ + + static int ext4_mb_init_cache(struct page *page, char *incore) +@@ -910,6 +935,11 @@ out: + return err; + } + ++/* ++ * Locking note: This routine calls ext4_mb_init_cache(), which takes the ++ * block group lock of all groups for this page; do not hold the BG lock when ++ * calling this routine! ++ */ + static noinline_for_stack + int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) + { +@@ -1004,6 +1034,11 @@ err: + return ret; + } + ++/* ++ * Locking note: This routine calls ext4_mb_init_cache(), which takes the ++ * block group lock of all groups for this page; do not hold the BG lock when ++ * calling this routine! ++ */ + static noinline_for_stack int + ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) +@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode + buddy = buddy2; + } while (1); + } ++ mb_set_largest_free_order(sb, e4b->bd_info); + mb_check_buddy(e4b); + } + +@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_budd + e4b->bd_info->bb_counters[ord]++; + e4b->bd_info->bb_counters[ord]++; + } ++ mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_check_buddy(e4b); +@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_al + } + } + ++/* This is now called BEFORE we load the buddy bitmap. 
*/ + static int ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) + { + unsigned free, fragments; +- unsigned i, bits; + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + + BUG_ON(cr < 0 || cr >= 4); +- BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); ++ ++ /* We only do this if the grp has never been initialized */ ++ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ++ int ret = ext4_mb_init_group(ac->ac_sb, group); ++ if (ret) ++ return 0; ++ } + + free = grp->bb_free; + fragments = grp->bb_fragments; +@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext + case 0: + BUG_ON(ac->ac_2order == 0); + ++ if (grp->bb_largest_free_order < ac->ac_2order) ++ return 0; ++ + /* Avoid using the first bg of a flexgroup for data files */ + if ((ac->ac_flags & EXT4_MB_HINT_DATA) && + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && + ((group % flex_size) == 0)) + return 0; + +- bits = ac->ac_sb->s_blocksize_bits + 1; +- for (i = ac->ac_2order; i <= bits; i++) +- if (grp->bb_counters[i] > 0) +- return 1; +- break; ++ return 1; + case 1: + if ((free / fragments) >= ac->ac_g_ex.fe_len) + return 1; +@@ -2026,14 +2068,11 @@ repeat: + group = ac->ac_g_ex.fe_group; + + for (i = 0; i < ngroups; group++, i++) { +- struct ext4_group_info *grp; +- + if (group == ngroups) + group = 0; + +- /* quick check to skip empty groups */ +- grp = ext4_get_group_info(sb, group); +- if (grp->bb_free == 0) ++ /* This now checks without needing the buddy page */ ++ if (!ext4_mb_good_group(ac, group, cr)) + continue; + + err = ext4_mb_load_buddy(sb, group, &e4b); +@@ -2041,8 +2080,12 @@ repeat: + goto out; + + ext4_lock_group(sb, group); ++ ++ /* ++ * We need to check again after locking the ++ * block group ++ */ + if (!ext4_mb_good_group(ac, group, cr)) { +- /* someone did allocation from this group */ + ext4_unlock_group(sb, group); + ext4_mb_unload_buddy(&e4b); + continue; +@@ -2255,6 +2298,7 @@ 
int ext4_mb_add_groupinfo(struct super_b + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); + meta_group_info[i]->bb_free_root.rb_node = NULL; ++ meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + + #ifdef DOUBLE_CHECK + { diff --git a/queue-2.6.32/0045-ext4-Show-journal_checksum-option.patch b/queue-2.6.32/0045-ext4-Show-journal_checksum-option.patch new file mode 100644 index 00000000000..b497975dcdf --- /dev/null +++ b/queue-2.6.32/0045-ext4-Show-journal_checksum-option.patch @@ -0,0 +1,27 @@ +From ab93377b76de07d4c8aacde97418651c7df6854e Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Sun, 30 May 2010 22:49:59 -0400 +Subject: ext4: Show journal_checksum option + +commit 39a4bade8c1826b658316d66ee81c09b0a4d7d42 upstream (as of v2.6.34-git13) + +We failed to show journal_checksum option in /proc/mounts. Fix it. + +Signed-off-by: Jan Kara +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -877,6 +877,8 @@ static int ext4_show_options(struct seq_ + seq_puts(seq, test_opt(sb, BARRIER) ? 
"1" : "0"); + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + seq_puts(seq, ",journal_async_commit"); ++ else if (test_opt(sb, JOURNAL_CHECKSUM)) ++ seq_puts(seq, ",journal_checksum"); + if (test_opt(sb, NOBH)) + seq_puts(seq, ",nobh"); + if (test_opt(sb, I_VERSION)) diff --git a/queue-2.6.32/0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch b/queue-2.6.32/0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch new file mode 100644 index 00000000000..03444c2eb30 --- /dev/null +++ b/queue-2.6.32/0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch @@ -0,0 +1,558 @@ +From cc781d3f1f03b2fd24b7260ed319dc34bf605ed0 Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:50:00 -0400 +Subject: ext4: Use bitops to read/modify i_flags in struct ext4_inode_info + +commit 12e9b892002d9af057655d35b44db8ee9243b0dc upstream (as of v2.6.34-git13) + +At several places we modify EXT4_I(inode)->i_flags without holding +i_mutex (ext4_do_update_inode, ...). These modifications are racy and +we can lose updates to i_flags. So convert handling of i_flags to use +bitops which are atomic. 
+ +https://bugzilla.kernel.org/show_bug.cgi?id=15792 + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/dir.c | 4 - + fs/ext4/ext4.h | 109 +++++++++++++++++++++++++++++++++++++++++++------- + fs/ext4/ext4_jbd2.h | 6 +- + fs/ext4/extents.c | 10 ++-- + fs/ext4/file.c | 2 + fs/ext4/ialloc.c | 4 - + fs/ext4/inode.c | 30 ++++++------- + fs/ext4/mballoc.c | 4 - + fs/ext4/migrate.c | 2 + fs/ext4/move_extent.c | 4 - + fs/ext4/namei.c | 10 ++-- + fs/ext4/super.c | 1 + fs/ext4/xattr.c | 4 - + 13 files changed, 135 insertions(+), 55 deletions(-) + +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *fil + + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX) && +- ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || ++ ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || + ((inode->i_size >> sb->s_blocksize_bits) == 1))) { + err = ext4_dx_readdir(filp, dirent, filldir); + if (err != ERR_BAD_DX_DIR) { +@@ -122,7 +122,7 @@ static int ext4_readdir(struct file *fil + * We don't set the inode dirty flag since it's not + * critical that it get flushed back to the disk. 
+ */ +- EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; ++ ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); + } + stored = 0; + offset = filp->f_pos & (sb->s_blocksize - 1); +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -315,6 +315,83 @@ static inline __u32 ext4_mask_flags(umod + return flags & EXT4_OTHER_FLMASK; + } + ++/* ++ * Inode flags used for atomic set/get ++ */ ++enum { ++ EXT4_INODE_SECRM = 0, /* Secure deletion */ ++ EXT4_INODE_UNRM = 1, /* Undelete */ ++ EXT4_INODE_COMPR = 2, /* Compress file */ ++ EXT4_INODE_SYNC = 3, /* Synchronous updates */ ++ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ ++ EXT4_INODE_APPEND = 5, /* writes to file may only append */ ++ EXT4_INODE_NODUMP = 6, /* do not dump file */ ++ EXT4_INODE_NOATIME = 7, /* do not update atime */ ++/* Reserved for compression usage... */ ++ EXT4_INODE_DIRTY = 8, ++ EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ ++ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ ++ EXT4_INODE_ECOMPR = 11, /* Compression error */ ++/* End compression flags --- maybe not all used */ ++ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ ++ EXT4_INODE_IMAGIC = 13, /* AFS directory */ ++ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ ++ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ ++ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ ++ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ ++ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ ++ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ ++ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ ++ EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */ ++ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ ++}; ++ ++#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG)) ++#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \ ++ printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \ ++ EXT4_##FLAG##_FL, 
EXT4_INODE_##FLAG); BUG_ON(1); } ++ ++/* ++ * Since it's pretty easy to mix up bit numbers and hex values, and we ++ * can't do a compile-time test for ENUM values, we use a run-time ++ * test to make sure that EXT4_XXX_FL is consistent with respect to ++ * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop ++ * out so it won't cost any extra space in the compiled kernel image. ++ * But it's important that these values are the same, since we are ++ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL ++ * must be consistent with the values of FS_XXX_FL defined in ++ * include/linux/fs.h and the on-disk values found in ext2, ext3, and ++ * ext4 filesystems, and of course the values defined in e2fsprogs. ++ * ++ * It's not paranoia if the Murphy's Law really *is* out to get you. :-) ++ */ ++static inline void ext4_check_flag_values(void) ++{ ++ CHECK_FLAG_VALUE(SECRM); ++ CHECK_FLAG_VALUE(UNRM); ++ CHECK_FLAG_VALUE(COMPR); ++ CHECK_FLAG_VALUE(SYNC); ++ CHECK_FLAG_VALUE(IMMUTABLE); ++ CHECK_FLAG_VALUE(APPEND); ++ CHECK_FLAG_VALUE(NODUMP); ++ CHECK_FLAG_VALUE(NOATIME); ++ CHECK_FLAG_VALUE(DIRTY); ++ CHECK_FLAG_VALUE(COMPRBLK); ++ CHECK_FLAG_VALUE(NOCOMPR); ++ CHECK_FLAG_VALUE(ECOMPR); ++ CHECK_FLAG_VALUE(INDEX); ++ CHECK_FLAG_VALUE(IMAGIC); ++ CHECK_FLAG_VALUE(JOURNAL_DATA); ++ CHECK_FLAG_VALUE(NOTAIL); ++ CHECK_FLAG_VALUE(DIRSYNC); ++ CHECK_FLAG_VALUE(TOPDIR); ++ CHECK_FLAG_VALUE(HUGE_FILE); ++ CHECK_FLAG_VALUE(EXTENTS); ++ CHECK_FLAG_VALUE(EA_INODE); ++ CHECK_FLAG_VALUE(EOFBLOCKS); ++ CHECK_FLAG_VALUE(RESERVED); ++} ++ + /* Used to pass group descriptor data when online resize is done */ + struct ext4_new_group_input { + __u32 group; /* Group number for this data */ +@@ -603,9 +680,8 @@ struct ext4_ext_cache { + */ + struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ +- __u32 i_flags; +- ext4_fsblk_t i_file_acl; + __u32 i_dtime; ++ ext4_fsblk_t i_file_acl; + + /* + * i_block_group is the number of the block group which 
contains +@@ -616,6 +692,7 @@ struct ext4_inode_info { + */ + ext4_group_t i_block_group; + unsigned long i_state_flags; /* Dynamic state flags */ ++ unsigned long i_flags; + + ext4_lblk_t i_dir_start_lookup; + #ifdef CONFIG_EXT4_FS_XATTR +@@ -1049,20 +1126,22 @@ enum { + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ + }; + +-static inline int ext4_test_inode_state(struct inode *inode, int bit) +-{ +- return test_bit(bit, &EXT4_I(inode)->i_state_flags); +-} +- +-static inline void ext4_set_inode_state(struct inode *inode, int bit) +-{ +- set_bit(bit, &EXT4_I(inode)->i_state_flags); ++#define EXT4_INODE_BIT_FNS(name, field) \ ++static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ ++{ \ ++ return test_bit(bit, &EXT4_I(inode)->i_##field); \ ++} \ ++static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ ++{ \ ++ set_bit(bit, &EXT4_I(inode)->i_##field); \ ++} \ ++static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ ++{ \ ++ clear_bit(bit, &EXT4_I(inode)->i_##field); \ + } + +-static inline void ext4_clear_inode_state(struct inode *inode, int bit) +-{ +- clear_bit(bit, &EXT4_I(inode)->i_state_flags); +-} ++EXT4_INODE_BIT_FNS(flag, flags) ++EXT4_INODE_BIT_FNS(state, state_flags) + #else + /* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. 
This will allow us to call the feature-test +@@ -1247,7 +1326,7 @@ struct ext4_dir_entry_2 { + + #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ +- (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) ++ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) + #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) + #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +--- a/fs/ext4/ext4_jbd2.h ++++ b/fs/ext4/ext4_jbd2.h +@@ -282,7 +282,7 @@ static inline int ext4_should_journal_da + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 1; +- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ++ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return 1; + return 0; + } +@@ -293,7 +293,7 @@ static inline int ext4_should_order_data + return 0; + if (!S_ISREG(inode->i_mode)) + return 0; +- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ++ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return 1; +@@ -306,7 +306,7 @@ static inline int ext4_should_writeback_ + return 0; + if (EXT4_JOURNAL(inode) == NULL) + return 1; +- if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) ++ if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return 1; +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3409,12 +3409,12 @@ int ext4_ext_get_blocks(handle_t *handle + } + } + +- if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { ++ if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { + if (eh->eh_entries) { + last_ex = EXT_LAST_EXTENT(eh); + if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + + ext4_ext_get_actual_len(last_ex)) +- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; ++ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } else { + 
WARN_ON(eh->eh_entries == 0); + ext4_error(inode->i_sb, __func__, +@@ -3560,7 +3560,7 @@ static void ext4_falloc_update_inode(str + * can proceed even if the new size is the same as i_size. + */ + if (new_size > i_size_read(inode)) +- EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; ++ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + + } +@@ -3588,7 +3588,7 @@ long ext4_fallocate(struct inode *inode, + * currently supporting (pre)allocate mode for extent-based + * files _only_ + */ +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; + + /* preallocation to directories is currently not supported */ +@@ -3838,7 +3838,7 @@ int ext4_fiemap(struct inode *inode, str + int error = 0; + + /* fallback to generic here if not in extents fmt */ +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -65,7 +65,7 @@ ext4_file_write(struct kiocb *iocb, cons + * is smaller than s_maxbytes, which is for extent-mapped files. 
+ */ + +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + size_t length = iov_length(iov, nr_segs); + +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -497,7 +497,7 @@ static int find_group_orlov(struct super + + if (S_ISDIR(mode) && + ((parent == sb->s_root->d_inode) || +- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { ++ (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { + int best_ndir = inodes_per_group; + int ret = -1; + +@@ -1044,7 +1044,7 @@ got: + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { + /* set extent flag only for directory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { +- EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; ++ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS); + ext4_ext_tree_init(handle, inode); + } + } +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -957,7 +957,7 @@ static int ext4_ind_get_blocks(handle_t + int count = 0; + ext4_fsblk_t first_block = 0; + +- J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); ++ J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); + depth = ext4_block_to_path(inode, iblock, offsets, + &blocks_to_boundary); +@@ -1085,7 +1085,7 @@ static int ext4_indirect_calc_metadata_a + */ + static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) + { +- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) ++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ext_calc_metadata_amount(inode, lblock); + + return ext4_indirect_calc_metadata_amount(inode, lblock); +@@ -1274,7 +1274,7 @@ int ext4_get_blocks(handle_t *handle, st + * file system block. 
+ */ + down_read((&EXT4_I(inode)->i_data_sem)); +- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, + bh, 0); + } else { +@@ -1336,7 +1336,7 @@ int ext4_get_blocks(handle_t *handle, st + * We need to check for EXT4 here because migrate + * could have changed the inode type in between + */ +- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, + bh, flags); + } else { +@@ -2371,7 +2371,7 @@ static void mpage_add_bh_to_extent(struc + goto flush_it; + + /* check if thereserved journal credits might overflow */ +- if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { ++ if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal +@@ -2836,7 +2836,7 @@ static int ext4_da_writepages_trans_bloc + * number of contiguous block. 
So we will limit + * number of contiguous block to a sane value + */ +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + +@@ -3872,7 +3872,7 @@ static ssize_t ext4_direct_IO(int rw, st + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + +- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) ++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); + + return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); +@@ -4503,12 +4503,12 @@ void ext4_truncate(struct inode *inode) + if (!ext4_can_truncate(inode)) + return; + +- EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; ++ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + + if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) + ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); + +- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { ++ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_ext_truncate(inode); + return; + } +@@ -5350,7 +5350,7 @@ int ext4_setattr(struct dentry *dentry, + } + + if (attr->ia_valid & ATTR_SIZE) { +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (attr->ia_size > sbi->s_bitmap_maxbytes) { +@@ -5363,7 +5363,7 @@ int ext4_setattr(struct dentry *dentry, + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && + (attr->ia_size < inode->i_size || +- (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { ++ (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) { + handle_t *handle; + + handle = ext4_journal_start(inode, 3); +@@ -5395,7 +5395,7 @@ int ext4_setattr(struct dentry *dentry, + } + } + /* ext4_truncate will clear the flag */ +- if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) ++ if ((ext4_test_inode_flag(inode, 
EXT4_INODE_EOFBLOCKS))) + ext4_truncate(inode); + } + +@@ -5471,7 +5471,7 @@ static int ext4_indirect_trans_blocks(st + + static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) + { +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return ext4_indirect_trans_blocks(inode, nrblocks, chunk); + return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); + } +@@ -5806,9 +5806,9 @@ int ext4_change_inode_journal_flag(struc + */ + + if (val) +- EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; ++ ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + else +- EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; ++ ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2008,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_al + sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); + /* non-extent files are limited to low blocks/groups */ +- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) + ngroups = sbi->s_blockfile_groups; + + BUG_ON(ac->ac_status == AC_STATUS_FOUND); +@@ -3176,7 +3176,7 @@ ext4_mb_use_preallocated(struct ext4_all + continue; + + /* non-extent files can't have physical blocks past 2^32 */ +- if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && ++ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && + pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) + continue; + +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -465,7 +465,7 @@ int ext4_ext_migrate(struct inode *inode + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || +- (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EINVAL; + + if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) +--- a/fs/ext4/move_extent.c 
++++ b/fs/ext4/move_extent.c +@@ -975,11 +975,11 @@ mext_check_arguments(struct inode *orig_ + } + + /* Ext4 move extent supports only extent based file */ +- if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { ++ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: orig file is not extents " + "based file [ino:orig %lu]\n", orig_inode->i_ino); + return -EOPNOTSUPP; +- } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { ++ } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { + ext4_debug("ext4 move extent: donor file is not extents " + "based file [ino:donor %lu]\n", donor_inode->i_ino); + return -EOPNOTSUPP; +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -660,7 +660,7 @@ int ext4_htree_fill_tree(struct file *di + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); + dir = dir_file->f_path.dentry->d_inode; +- if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { ++ if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { + hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += +@@ -805,7 +805,7 @@ static void ext4_update_dx_flag(struct i + { + if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_COMPAT_DIR_INDEX)) +- EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; ++ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } + + /* +@@ -1424,7 +1424,7 @@ static int make_indexed_dir(handle_t *ha + brelse(bh); + return retval; + } +- EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; ++ ext4_set_inode_flag(dir, EXT4_INODE_INDEX); + data1 = bh2->b_data; + + memcpy (data1, de, len); +@@ -1497,7 +1497,7 @@ static int ext4_add_entry(handle_t *hand + retval = ext4_dx_add_entry(handle, dentry, inode); + if (!retval || (retval != ERR_BAD_DX_DIR)) + return retval; +- EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; ++ ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); 
+ } +@@ -2292,7 +2292,7 @@ retry: + } + } else { + /* clear the extent format for fast symlink */ +- EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; ++ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); + inode->i_op = &ext4_fast_symlink_inode_operations; + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); + inode->i_size = l-1; +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -3999,6 +3999,7 @@ static int __init init_ext4_fs(void) + { + int err; + ++ ext4_check_flag_values(); + err = init_ext4_system_zone(); + if (err) + return err; +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -816,7 +816,7 @@ inserted: + EXT4_I(inode)->i_block_group); + + /* non-extent files can't have physical blocks past 2^32 */ +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; + + block = ext4_new_meta_blocks(handle, inode, +@@ -824,7 +824,7 @@ inserted: + if (error) + goto cleanup; + +- if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) ++ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); + + ea_idebug(inode, "creating block %d", block); diff --git a/queue-2.6.32/0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch b/queue-2.6.32/0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch new file mode 100644 index 00000000000..e9a1275f0c1 --- /dev/null +++ b/queue-2.6.32/0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch @@ -0,0 +1,51 @@ +From 570f16c4bfa97a7b2d3b3e6c0b8936ee91f32481 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:50:01 -0400 +Subject: ext4: Avoid crashing on NULL ptr dereference on a filesystem error + +commit f70f362b4a6fe47c239dbfb3efc0cc2c10e4f09c upstream (as of v2.6.34-git13) + +If the EOFBLOCK_FL flag is set when it should not be and the inode is +zero length, then eh_entries is zero, and ex is NULL, so dereferencing +ex to print ex->ee_block causes a kernel OOPS 
in +ext4_ext_map_blocks(). + +On top of that, the error message which is printed isn't very helpful. +So we fix this by printing something more explanatory which doesn't +involve trying to print ex->ee_block. + +Addresses-Google-Bug: #2655740 + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3281,8 +3281,8 @@ int ext4_ext_get_blocks(handle_t *handle + */ + if (path[depth].p_ext == NULL && depth != 0) { + ext4_error(inode->i_sb, __func__, "bad extent address " +- "inode: %lu, iblock: %d, depth: %d", +- inode->i_ino, iblock, depth); ++ "inode: %lu, iblock: %lu, depth: %d", ++ inode->i_ino, (unsigned long) iblock, depth); + err = -EIO; + goto out2; + } +@@ -3418,8 +3418,11 @@ int ext4_ext_get_blocks(handle_t *handle + } else { + WARN_ON(eh->eh_entries == 0); + ext4_error(inode->i_sb, __func__, +- "inode#%lu, eh->eh_entries = 0!", inode->i_ino); +- } ++ "inode#%lu, eh->eh_entries = 0 and " ++ "EOFBLOCKS_FL set", inode->i_ino); ++ err = -EIO; ++ goto out2; ++ } + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err) { diff --git a/queue-2.6.32/0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch b/queue-2.6.32/0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch new file mode 100644 index 00000000000..d5e1bac9539 --- /dev/null +++ b/queue-2.6.32/0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch @@ -0,0 +1,78 @@ +From 3b2905c2bc46795b9c8e54ddc435bd78f4391972 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sun, 30 May 2010 22:50:02 -0400 +Subject: ext4: Clear the EXT4_EOFBLOCKS_FL flag only when warranted + +commit 786ec7915e530936b9eb2e3d12274145cab7aa7d upstream (as of v2.6.34-git13) + +Dmitry Monakhov discovered an edge case where it was possible for the +EXT4_EOFBLOCKS_FL flag to get cleared unnecessarily.
This is true; +I have a test case that can be exercised via downloading and +decompressing the file: + +wget ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/ext4-testcases/eofblocks-fl-test-case.img.bz2 +bunzip2 eofblocks-fl-test-case.img +dd if=/dev/zero of=eofblocks-fl-test-case.img bs=1k seek=17925 bs=1k count=1 conv=notrunc + +However, triggering it in real life is highly unlikely since it +requires an extremely fragmented sparse file with a hole in exactly +the right place in the extent tree. (It actually took quite a bit of +work to generate this test case.) Still, it's nice to get even +extreme corner cases to be correct, so this patch makes sure that we +don't clear the EXT4_EOFBLOCKS_FL incorrectly even in this corner +case. + +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 26 ++++++++++++++++++-------- + 1 file changed, 18 insertions(+), 8 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3229,7 +3229,7 @@ int ext4_ext_get_blocks(handle_t *handle + struct ext4_extent_header *eh; + struct ext4_extent newex, *ex, *last_ex; + ext4_fsblk_t newblock; +- int err = 0, depth, ret, cache_type; ++ int i, err = 0, depth, ret, cache_type; + unsigned int allocated = 0; + struct ext4_allocation_request ar; + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; +@@ -3410,19 +3410,29 @@ int ext4_ext_get_blocks(handle_t *handle + } + + if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) { +- if (eh->eh_entries) { +- last_ex = EXT_LAST_EXTENT(eh); +- if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) +- + ext4_ext_get_actual_len(last_ex)) +- ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); +- } else { +- WARN_ON(eh->eh_entries == 0); ++ if (unlikely(!eh->eh_entries)) { + ext4_error(inode->i_sb, __func__, + "inode#%lu, eh->eh_entries = 0 and " + "EOFBLOCKS_FL set", inode->i_ino); + err = -EIO; + goto out2; + } ++ last_ex = EXT_LAST_EXTENT(eh); ++ /* ++ * If the current leaf block was reached 
by looking at ++ * the last index block all the way down the tree, and ++ * we are extending the inode beyond the last extent ++ * in the current leaf block, then clear the ++ * EOFBLOCKS_FL flag. ++ */ ++ for (i = depth-1; i >= 0; i--) { ++ if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr)) ++ break; ++ } ++ if ((i < 0) && ++ (iblock + ar.len > le32_to_cpu(last_ex->ee_block) + ++ ext4_ext_get_actual_len(last_ex))) ++ ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); + } + err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + if (err) { diff --git a/queue-2.6.32/0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch b/queue-2.6.32/0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch new file mode 100644 index 00000000000..9f903040922 --- /dev/null +++ b/queue-2.6.32/0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch @@ -0,0 +1,83 @@ +From b3143b86111dcac45717136a6d776f993aace17f Mon Sep 17 00:00:00 2001 +From: Dmitry Monakhov +Date: Sun, 30 May 2010 22:50:03 -0400 +Subject: ext4: restart ext4_ext_remove_space() after transaction restart + +commit 0617b83fa239db9743a18ce6cc0e556f4d0fd567 upstream (as of v2.6.34-git13) + +If i_data_sem was internally dropped due to transaction restart, it is +necessary to restart path look-up because extents tree was possibly +modified by ext4_get_block(). + +https://bugzilla.kernel.org/show_bug.cgi?id=15827 + +Signed-off-by: Dmitry Monakhov +Signed-off-by: "Theodore Ts'o" +Acked-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/extents.c | 16 +++++++++------- + 1 file changed, 9 insertions(+), 7 deletions(-) + +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_rest + if (err <= 0) + return err; + err = ext4_truncate_restart_trans(handle, inode, needed); +- /* +- * We have dropped i_data_sem so someone might have cached again +- * an extent we are going to truncate. 
+- */ +- ext4_ext_invalidate_cache(inode); ++ if (err == 0) ++ err = -EAGAIN; + + return err; + } +@@ -2263,7 +2260,7 @@ static int ext4_ext_remove_space(struct + int depth = ext_depth(inode); + struct ext4_ext_path *path; + handle_t *handle; +- int i = 0, err = 0; ++ int i, err; + + ext_debug("truncate since %u\n", start); + +@@ -2272,23 +2269,26 @@ static int ext4_ext_remove_space(struct + if (IS_ERR(handle)) + return PTR_ERR(handle); + ++again: + ext4_ext_invalidate_cache(inode); + + /* + * We start scanning from right side, freeing all the blocks + * after i_size and walking into the tree depth-wise. + */ ++ depth = ext_depth(inode); + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); + if (path == NULL) { + ext4_journal_stop(handle); + return -ENOMEM; + } ++ path[0].p_depth = depth; + path[0].p_hdr = ext_inode_hdr(inode); + if (ext4_ext_check(inode, path[0].p_hdr, depth)) { + err = -EIO; + goto out; + } +- path[0].p_depth = depth; ++ i = err = 0; + + while (i >= 0 && err == 0) { + if (i == depth) { +@@ -2382,6 +2382,8 @@ static int ext4_ext_remove_space(struct + out: + ext4_ext_drop_refs(path); + kfree(path); ++ if (err == -EAGAIN) ++ goto again; + ext4_journal_stop(handle); + + return err; diff --git a/queue-2.6.32/0050-ext4-Conditionally-define-compat-ioctl-numbers.patch b/queue-2.6.32/0050-ext4-Conditionally-define-compat-ioctl-numbers.patch new file mode 100644 index 00000000000..1adce6b15a5 --- /dev/null +++ b/queue-2.6.32/0050-ext4-Conditionally-define-compat-ioctl-numbers.patch @@ -0,0 +1,36 @@ +From e58debc557cca3fa1ce0f893978be42dfa489699 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Sun, 30 May 2010 22:50:04 -0400 +Subject: ext4: Conditionally define compat ioctl numbers + +commit 899ad0cea6ad7ff4ba24b16318edbc3cbbe03fad upstream (as of v2.6.34-git13) + +It is unnecessary, and in general impossible, to define the compat +ioctl numbers except when building the filesystem with CONFIG_COMPAT +defined. 
+ +Signed-off-by: Ben Hutchings +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -462,6 +462,7 @@ struct ext4_new_group_data { + #define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) + #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) + ++#if defined(__KERNEL__) && defined(CONFIG_COMPAT) + /* + * ioctl commands in 32 bit emulation + */ +@@ -477,6 +478,7 @@ struct ext4_new_group_data { + #endif + #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION + #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION ++#endif + + + /* diff --git a/queue-2.6.32/0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch b/queue-2.6.32/0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch new file mode 100644 index 00000000000..420ca0ccca6 --- /dev/null +++ b/queue-2.6.32/0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch @@ -0,0 +1,91 @@ +From a496748686cdccd4b5bf1b5696919e380dc48da0 Mon Sep 17 00:00:00 2001 +From: Ben Hutchings +Date: Sun, 30 May 2010 22:50:05 -0400 +Subject: ext4: Fix compat EXT4_IOC_ADD_GROUP + +commit 4d92dc0f00a775dc2e1267b0e00befb783902fe7 upstream (as of v2.6.34-git13) + +struct ext4_new_group_input needs to be converted because u64 has +only 32-bit alignment on some 32-bit architectures, notably i386. 
+ +Signed-off-by: Ben Hutchings +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 16 ++++++++++++++++ + fs/ext4/ioctl.c | 25 +++++++++++++++++++++++-- + 2 files changed, 39 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -29,6 +29,9 @@ + #include + #include + #include ++#ifdef __KERNEL__ ++#include ++#endif + + /* + * The fourth extended filesystem constants/structures +@@ -403,6 +406,18 @@ struct ext4_new_group_input { + __u16 unused; + }; + ++#if defined(__KERNEL__) && defined(CONFIG_COMPAT) ++struct compat_ext4_new_group_input { ++ u32 group; ++ compat_u64 block_bitmap; ++ compat_u64 inode_bitmap; ++ compat_u64 inode_table; ++ u32 blocks_count; ++ u16 reserved_blocks; ++ u16 unused; ++}; ++#endif ++ + /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ + struct ext4_new_group_data { + __u32 group; +@@ -473,6 +488,7 @@ struct ext4_new_group_data { + #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) + #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) + #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) ++#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) + #ifdef CONFIG_JBD2_DEBUG + #define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) + #endif +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -373,8 +373,29 @@ long ext4_compat_ioctl(struct file *file + case EXT4_IOC32_SETRSVSZ: + cmd = EXT4_IOC_SETRSVSZ; + break; +- case EXT4_IOC_GROUP_ADD: +- break; ++ case EXT4_IOC32_GROUP_ADD: { ++ struct compat_ext4_new_group_input __user *uinput; ++ struct ext4_new_group_input input; ++ mm_segment_t old_fs; ++ int err; ++ ++ uinput = compat_ptr(arg); ++ err = get_user(input.group, &uinput->group); ++ err |= get_user(input.block_bitmap, &uinput->block_bitmap); ++ err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); ++ err |= get_user(input.inode_table, &uinput->inode_table); ++ err |= get_user(input.blocks_count, &uinput->blocks_count); ++ err 
|= get_user(input.reserved_blocks, ++ &uinput->reserved_blocks); ++ if (err) ++ return -EFAULT; ++ old_fs = get_fs(); ++ set_fs(KERNEL_DS); ++ err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, ++ (unsigned long) &input); ++ set_fs(old_fs); ++ return err; ++ } + case EXT4_IOC_MOVE_EXT: + break; + default: diff --git a/queue-2.6.32/0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch b/queue-2.6.32/0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch new file mode 100644 index 00000000000..ed8bb2e80a0 --- /dev/null +++ b/queue-2.6.32/0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch @@ -0,0 +1,92 @@ +From 2959737e6c8ee73e85bf706f11b272bab323597f Mon Sep 17 00:00:00 2001 +From: Frank Mayhar +Date: Sun, 30 May 2010 22:50:06 -0400 +Subject: ext4: Make fsync sync new parent directories in no-journal mode + +commit 14ece1028b3ed53ffec1b1213ffc6acaf79ad77c upstream (as of v2.6.34-git13) + +Add a new ext4 state to tell us when a file has been newly created; use +that state in ext4_sync_file in no-journal mode to tell us when we need +to sync the parent directory as well as the inode and data itself. This +fixes a problem in which a panic or power failure may lose the entire +file even when using fsync, since the parent directory entry is lost. 
+ +Addresses-Google-Bug: #2480057 + +Signed-off-by: Frank Mayhar +Signed-off-by: "Theodore Ts'o" +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/ext4.h | 1 + + fs/ext4/fsync.c | 31 +++++++++++++++++++++++++++++-- + fs/ext4/namei.c | 2 ++ + 3 files changed, 32 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1142,6 +1142,7 @@ enum { + EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ + EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ + EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ ++ EXT4_STATE_NEWENTRY, /* File just added to dir */ + }; + + #define EXT4_INODE_BIT_FNS(name, field) \ +--- a/fs/ext4/fsync.c ++++ b/fs/ext4/fsync.c +@@ -35,6 +35,29 @@ + #include + + /* ++ * If we're not journaling and this is a just-created file, we have to ++ * sync our parent directory (if it was freshly created) since ++ * otherwise it will only be written by writeback, leaving a huge ++ * window during which a crash may lose the file. This may apply for ++ * the parent directory's parent as well, and so on recursively, if ++ * they are also freshly created. ++ */ ++static void ext4_sync_parent(struct inode *inode) ++{ ++ struct dentry *dentry = NULL; ++ ++ while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ++ ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); ++ dentry = list_entry(inode->i_dentry.next, ++ struct dentry, d_alias); ++ if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) ++ break; ++ inode = dentry->d_parent->d_inode; ++ sync_mapping_buffers(inode->i_mapping); ++ } ++} ++ ++/* + * akpm: A new design for ext4_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 
+@@ -67,8 +90,12 @@ int ext4_sync_file(struct file *file, st + if (ret < 0) + return ret; + +- if (!journal) +- return simple_fsync(file, dentry, datasync); ++ if (!journal) { ++ ret = simple_fsync(file, dentry, datasync); ++ if (!ret && !list_empty(&inode->i_dentry)) ++ ext4_sync_parent(inode); ++ return ret; ++ } + + /* + * data=writeback,ordered: +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1525,6 +1525,8 @@ static int ext4_add_entry(handle_t *hand + de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); + retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + brelse(bh); ++ if (retval == 0) ++ ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } + diff --git a/queue-2.6.32/series b/queue-2.6.32/series index af745db6180..100bdec8442 100644 --- a/queue-2.6.32/series +++ b/queue-2.6.32/series @@ -78,3 +78,59 @@ usb-sisusbvga-fix-for-usb-3.0.patch usb-add-quirk-for-broadcom-bt-dongle.patch usb-ftdi-add-support-for-the-rt-system-vx-7-radio-programming-cable.patch ethtool-fix-potential-user-buffer-overflow-for-ethtool_-g-s-rxfh.patch +0001-ext4-Fix-potential-quota-deadlock.patch +0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch +0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch +0004-ext4-Eliminate-potential-double-free-on-error-path.patch +0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch +0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch +0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch +0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch +0009-ext4-Calculate-metadata-requirements-more-accurately.patch +0010-ext4-Handle-EDQUOT-error-on-write.patch +0011-ext4-Fix-quota-accounting-error-with-fallocate.patch +0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch +0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch +0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch +0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch 
+0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch +0017-ext4-fix-error-handling-in-migrate.patch +0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch +0019-ext4-Handle-non-empty-on-disk-orphan-link.patch +0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch +0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch +0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch +0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch +0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch +0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch +0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch +0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch +0028-ext4-Issue-the-discard-operation-before-releasing-th.patch +0029-ext4-check-missed-return-value-in-ext4_sync_file.patch +0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch +0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch +0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch +0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch +0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch +0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch +0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch +0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch +0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch +0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch +0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch +0041-ext4-init-statistics-after-journal-recovery.patch +0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch +0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch +0044-ext4-check-for-a-good-block-group-before-loading-bud.patch +0045-ext4-Show-journal_checksum-option.patch +0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch 
+0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch +0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch +0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch +0050-ext4-Conditionally-define-compat-ioctl-numbers.patch +0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch +0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch +0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch +0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch +0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch +0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch