]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
.32 patches for ext4 and kvm
author Greg Kroah-Hartman <gregkh@suse.de>
Wed, 28 Jul 2010 23:35:27 +0000 (16:35 -0700)
committer Greg Kroah-Hartman <gregkh@suse.de>
Wed, 28 Jul 2010 23:35:27 +0000 (16:35 -0700)
57 files changed:
queue-2.6.32/0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch [new file with mode: 0644]
queue-2.6.32/0001-ext4-Fix-potential-quota-deadlock.patch [new file with mode: 0644]
queue-2.6.32/0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch [new file with mode: 0644]
queue-2.6.32/0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch [new file with mode: 0644]
queue-2.6.32/0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch [new file with mode: 0644]
queue-2.6.32/0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch [new file with mode: 0644]
queue-2.6.32/0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch [new file with mode: 0644]
queue-2.6.32/0004-ext4-Eliminate-potential-double-free-on-error-path.patch [new file with mode: 0644]
queue-2.6.32/0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch [new file with mode: 0644]
queue-2.6.32/0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch [new file with mode: 0644]
queue-2.6.32/0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch [new file with mode: 0644]
queue-2.6.32/0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch [new file with mode: 0644]
queue-2.6.32/0009-ext4-Calculate-metadata-requirements-more-accurately.patch [new file with mode: 0644]
queue-2.6.32/0010-ext4-Handle-EDQUOT-error-on-write.patch [new file with mode: 0644]
queue-2.6.32/0011-ext4-Fix-quota-accounting-error-with-fallocate.patch [new file with mode: 0644]
queue-2.6.32/0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch [new file with mode: 0644]
queue-2.6.32/0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch [new file with mode: 0644]
queue-2.6.32/0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch [new file with mode: 0644]
queue-2.6.32/0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch [new file with mode: 0644]
queue-2.6.32/0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch [new file with mode: 0644]
queue-2.6.32/0017-ext4-fix-error-handling-in-migrate.patch [new file with mode: 0644]
queue-2.6.32/0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch [new file with mode: 0644]
queue-2.6.32/0019-ext4-Handle-non-empty-on-disk-orphan-link.patch [new file with mode: 0644]
queue-2.6.32/0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch [new file with mode: 0644]
queue-2.6.32/0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch [new file with mode: 0644]
queue-2.6.32/0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch [new file with mode: 0644]
queue-2.6.32/0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch [new file with mode: 0644]
queue-2.6.32/0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch [new file with mode: 0644]
queue-2.6.32/0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch [new file with mode: 0644]
queue-2.6.32/0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch [new file with mode: 0644]
queue-2.6.32/0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch [new file with mode: 0644]
queue-2.6.32/0028-ext4-Issue-the-discard-operation-before-releasing-th.patch [new file with mode: 0644]
queue-2.6.32/0029-ext4-check-missed-return-value-in-ext4_sync_file.patch [new file with mode: 0644]
queue-2.6.32/0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch [new file with mode: 0644]
queue-2.6.32/0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch [new file with mode: 0644]
queue-2.6.32/0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch [new file with mode: 0644]
queue-2.6.32/0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch [new file with mode: 0644]
queue-2.6.32/0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch [new file with mode: 0644]
queue-2.6.32/0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch [new file with mode: 0644]
queue-2.6.32/0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch [new file with mode: 0644]
queue-2.6.32/0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch [new file with mode: 0644]
queue-2.6.32/0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch [new file with mode: 0644]
queue-2.6.32/0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch [new file with mode: 0644]
queue-2.6.32/0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch [new file with mode: 0644]
queue-2.6.32/0041-ext4-init-statistics-after-journal-recovery.patch [new file with mode: 0644]
queue-2.6.32/0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch [new file with mode: 0644]
queue-2.6.32/0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch [new file with mode: 0644]
queue-2.6.32/0044-ext4-check-for-a-good-block-group-before-loading-bud.patch [new file with mode: 0644]
queue-2.6.32/0045-ext4-Show-journal_checksum-option.patch [new file with mode: 0644]
queue-2.6.32/0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch [new file with mode: 0644]
queue-2.6.32/0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch [new file with mode: 0644]
queue-2.6.32/0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch [new file with mode: 0644]
queue-2.6.32/0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch [new file with mode: 0644]
queue-2.6.32/0050-ext4-Conditionally-define-compat-ioctl-numbers.patch [new file with mode: 0644]
queue-2.6.32/0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch [new file with mode: 0644]
queue-2.6.32/0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch [new file with mode: 0644]
queue-2.6.32/series

diff --git a/queue-2.6.32/0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch b/queue-2.6.32/0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch
new file mode 100644 (file)
index 0000000..2622393
--- /dev/null
@@ -0,0 +1,29 @@
+From 268a9f6207f354daedf0f92b0b57986bea37b69c Mon Sep 17 00:00:00 2001
+From: Avi Kivity <avi@redhat.com>
+Date: Thu, 27 May 2010 14:35:58 +0300
+Subject: KVM: MMU: Remove user access when allowing kernel access to gpte.w=0 page
+
+If cr0.wp=0, we have to allow the guest kernel access to a page with pte.w=0.
+We do that by setting spte.w=1, since the host cr0.wp must remain set so the
+host can write protect pages.  Once we allow write access, we must remove
+user access otherwise we mistakenly allow the user to write the page.
+
+Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit 69325a122580d3a7b26589e8efdd6663001c3297)
+---
+ arch/x86/kvm/mmu.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -1843,6 +1843,9 @@ static int set_spte(struct kvm_vcpu *vcp
+               spte |= PT_WRITABLE_MASK;
++              if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
++                      spte &= ~PT_USER_MASK;
++
+               /*
+                * Optimization: for pte sync, if spte was writable the hash
+                * lookup is unnecessary (and expensive). Write protection
diff --git a/queue-2.6.32/0001-ext4-Fix-potential-quota-deadlock.patch b/queue-2.6.32/0001-ext4-Fix-potential-quota-deadlock.patch
new file mode 100644 (file)
index 0000000..2107325
--- /dev/null
@@ -0,0 +1,242 @@
+From fea2aabf4ac586092b1a3acb4adb234bb4bf6266 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:14 -0400
+Subject: ext4: Fix potential quota deadlock
+
+commit d21cd8f163ac44b15c465aab7306db931c606908 upstream (as of v2.6.33-rc2)
+
+We have to delay vfs_dq_claim_space() until allocation context destruction.
+Currently we have following call-trace:
+ext4_mb_new_blocks()
+  /* task is already holding ac->alloc_semp */
+ ->ext4_mb_mark_diskspace_used
+    ->vfs_dq_claim_space()  /*  acquire dqptr_sem here. Possible deadlock */
+ ->ext4_mb_release_context() /* drop ac->alloc_semp here */
+
+Let's move quota claiming to ext4_da_update_reserve_space()
+
+ =======================================================
+ [ INFO: possible circular locking dependency detected ]
+ 2.6.32-rc7 #18
+ -------------------------------------------------------
+ write-truncate-/3465 is trying to acquire lock:
+  (&s->s_dquot.dqptr_sem){++++..}, at: [<c025e73b>] dquot_claim_space+0x3b/0x1b0
+
+ but task is already holding lock:
+  (&meta_group_info[i]->alloc_sem){++++..}, at: [<c02ce962>] ext4_mb_load_buddy+0xb2/0x370
+
+ which lock already depends on the new lock.
+
+ the existing dependency chain (in reverse order) is:
+
+ -> #3 (&meta_group_info[i]->alloc_sem){++++..}:
+        [<c017d04b>] __lock_acquire+0xd7b/0x1260
+        [<c017d5ea>] lock_acquire+0xba/0xd0
+        [<c0527191>] down_read+0x51/0x90
+        [<c02ce962>] ext4_mb_load_buddy+0xb2/0x370
+        [<c02d0c1c>] ext4_mb_free_blocks+0x46c/0x870
+        [<c029c9d3>] ext4_free_blocks+0x73/0x130
+        [<c02c8cfc>] ext4_ext_truncate+0x76c/0x8d0
+        [<c02a8087>] ext4_truncate+0x187/0x5e0
+        [<c01e0f7b>] vmtruncate+0x6b/0x70
+        [<c022ec02>] inode_setattr+0x62/0x190
+        [<c02a2d7a>] ext4_setattr+0x25a/0x370
+        [<c022ee81>] notify_change+0x151/0x340
+        [<c021349d>] do_truncate+0x6d/0xa0
+        [<c0221034>] may_open+0x1d4/0x200
+        [<c022412b>] do_filp_open+0x1eb/0x910
+        [<c021244d>] do_sys_open+0x6d/0x140
+        [<c021258e>] sys_open+0x2e/0x40
+        [<c0103100>] sysenter_do_call+0x12/0x32
+
+ -> #2 (&ei->i_data_sem){++++..}:
+        [<c017d04b>] __lock_acquire+0xd7b/0x1260
+        [<c017d5ea>] lock_acquire+0xba/0xd0
+        [<c0527191>] down_read+0x51/0x90
+        [<c02a5787>] ext4_get_blocks+0x47/0x450
+        [<c02a74c1>] ext4_getblk+0x61/0x1d0
+        [<c02a7a7f>] ext4_bread+0x1f/0xa0
+        [<c02bcddc>] ext4_quota_write+0x12c/0x310
+        [<c0262d23>] qtree_write_dquot+0x93/0x120
+        [<c0261708>] v2_write_dquot+0x28/0x30
+        [<c025d3fb>] dquot_commit+0xab/0xf0
+        [<c02be977>] ext4_write_dquot+0x77/0x90
+        [<c02be9bf>] ext4_mark_dquot_dirty+0x2f/0x50
+        [<c025e321>] dquot_alloc_inode+0x101/0x180
+        [<c029fec2>] ext4_new_inode+0x602/0xf00
+        [<c02ad789>] ext4_create+0x89/0x150
+        [<c0221ff2>] vfs_create+0xa2/0xc0
+        [<c02246e7>] do_filp_open+0x7a7/0x910
+        [<c021244d>] do_sys_open+0x6d/0x140
+        [<c021258e>] sys_open+0x2e/0x40
+        [<c0103100>] sysenter_do_call+0x12/0x32
+
+ -> #1 (&sb->s_type->i_mutex_key#7/4){+.+...}:
+        [<c017d04b>] __lock_acquire+0xd7b/0x1260
+        [<c017d5ea>] lock_acquire+0xba/0xd0
+        [<c0526505>] mutex_lock_nested+0x65/0x2d0
+        [<c0260c9d>] vfs_load_quota_inode+0x4bd/0x5a0
+        [<c02610af>] vfs_quota_on_path+0x5f/0x70
+        [<c02bc812>] ext4_quota_on+0x112/0x190
+        [<c026345a>] sys_quotactl+0x44a/0x8a0
+        [<c0103100>] sysenter_do_call+0x12/0x32
+
+ -> #0 (&s->s_dquot.dqptr_sem){++++..}:
+        [<c017d361>] __lock_acquire+0x1091/0x1260
+        [<c017d5ea>] lock_acquire+0xba/0xd0
+        [<c0527191>] down_read+0x51/0x90
+        [<c025e73b>] dquot_claim_space+0x3b/0x1b0
+        [<c02cb95f>] ext4_mb_mark_diskspace_used+0x36f/0x380
+        [<c02d210a>] ext4_mb_new_blocks+0x34a/0x530
+        [<c02c83fb>] ext4_ext_get_blocks+0x122b/0x13c0
+        [<c02a5966>] ext4_get_blocks+0x226/0x450
+        [<c02a5ff3>] mpage_da_map_blocks+0xc3/0xaa0
+        [<c02a6ed6>] ext4_da_writepages+0x506/0x790
+        [<c01de272>] do_writepages+0x22/0x50
+        [<c01d766d>] __filemap_fdatawrite_range+0x6d/0x80
+        [<c01d7b9b>] filemap_flush+0x2b/0x30
+        [<c02a40ac>] ext4_alloc_da_blocks+0x5c/0x60
+        [<c029e595>] ext4_release_file+0x75/0xb0
+        [<c0216b59>] __fput+0xf9/0x210
+        [<c0216c97>] fput+0x27/0x30
+        [<c02122dc>] filp_close+0x4c/0x80
+        [<c014510e>] put_files_struct+0x6e/0xd0
+        [<c01451b7>] exit_files+0x47/0x60
+        [<c0146a24>] do_exit+0x144/0x710
+        [<c0147028>] do_group_exit+0x38/0xa0
+        [<c0159abc>] get_signal_to_deliver+0x2ac/0x410
+        [<c0102849>] do_notify_resume+0xb9/0x890
+        [<c01032d2>] work_notifysig+0x13/0x21
+
+ other info that might help us debug this:
+
+ 3 locks held by write-truncate-/3465:
+  #0:  (jbd2_handle){+.+...}, at: [<c02e1f8f>] start_this_handle+0x38f/0x5c0
+  #1:  (&ei->i_data_sem){++++..}, at: [<c02a57f6>] ext4_get_blocks+0xb6/0x450
+  #2:  (&meta_group_info[i]->alloc_sem){++++..}, at: [<c02ce962>] ext4_mb_load_buddy+0xb2/0x370
+
+ stack backtrace:
+ Pid: 3465, comm: write-truncate- Not tainted 2.6.32-rc7 #18
+ Call Trace:
+  [<c0524cb3>] ? printk+0x1d/0x22
+  [<c017ac9a>] print_circular_bug+0xca/0xd0
+  [<c017d361>] __lock_acquire+0x1091/0x1260
+  [<c016bca2>] ? sched_clock_local+0xd2/0x170
+  [<c0178fd0>] ? trace_hardirqs_off_caller+0x20/0xd0
+  [<c017d5ea>] lock_acquire+0xba/0xd0
+  [<c025e73b>] ? dquot_claim_space+0x3b/0x1b0
+  [<c0527191>] down_read+0x51/0x90
+  [<c025e73b>] ? dquot_claim_space+0x3b/0x1b0
+  [<c025e73b>] dquot_claim_space+0x3b/0x1b0
+  [<c02cb95f>] ext4_mb_mark_diskspace_used+0x36f/0x380
+  [<c02d210a>] ext4_mb_new_blocks+0x34a/0x530
+  [<c02c601d>] ? ext4_ext_find_extent+0x25d/0x280
+  [<c02c83fb>] ext4_ext_get_blocks+0x122b/0x13c0
+  [<c016bca2>] ? sched_clock_local+0xd2/0x170
+  [<c016be60>] ? sched_clock_cpu+0x120/0x160
+  [<c016beef>] ? cpu_clock+0x4f/0x60
+  [<c0178fd0>] ? trace_hardirqs_off_caller+0x20/0xd0
+  [<c052712c>] ? down_write+0x8c/0xa0
+  [<c02a5966>] ext4_get_blocks+0x226/0x450
+  [<c016be60>] ? sched_clock_cpu+0x120/0x160
+  [<c016beef>] ? cpu_clock+0x4f/0x60
+  [<c017908b>] ? trace_hardirqs_off+0xb/0x10
+  [<c02a5ff3>] mpage_da_map_blocks+0xc3/0xaa0
+  [<c01d69cc>] ? find_get_pages_tag+0x16c/0x180
+  [<c01d6860>] ? find_get_pages_tag+0x0/0x180
+  [<c02a73bd>] ? __mpage_da_writepage+0x16d/0x1a0
+  [<c01dfc4e>] ? pagevec_lookup_tag+0x2e/0x40
+  [<c01ddf1b>] ? write_cache_pages+0xdb/0x3d0
+  [<c02a7250>] ? __mpage_da_writepage+0x0/0x1a0
+  [<c02a6ed6>] ext4_da_writepages+0x506/0x790
+  [<c016beef>] ? cpu_clock+0x4f/0x60
+  [<c016bca2>] ? sched_clock_local+0xd2/0x170
+  [<c016be60>] ? sched_clock_cpu+0x120/0x160
+  [<c016be60>] ? sched_clock_cpu+0x120/0x160
+  [<c02a69d0>] ? ext4_da_writepages+0x0/0x790
+  [<c01de272>] do_writepages+0x22/0x50
+  [<c01d766d>] __filemap_fdatawrite_range+0x6d/0x80
+  [<c01d7b9b>] filemap_flush+0x2b/0x30
+  [<c02a40ac>] ext4_alloc_da_blocks+0x5c/0x60
+  [<c029e595>] ext4_release_file+0x75/0xb0
+  [<c0216b59>] __fput+0xf9/0x210
+  [<c0216c97>] fput+0x27/0x30
+  [<c02122dc>] filp_close+0x4c/0x80
+  [<c014510e>] put_files_struct+0x6e/0xd0
+  [<c01451b7>] exit_files+0x47/0x60
+  [<c0146a24>] do_exit+0x144/0x710
+  [<c017b163>] ? lock_release_holdtime+0x33/0x210
+  [<c0528137>] ? _spin_unlock_irq+0x27/0x30
+  [<c0147028>] do_group_exit+0x38/0xa0
+  [<c017babb>] ? trace_hardirqs_on+0xb/0x10
+  [<c0159abc>] get_signal_to_deliver+0x2ac/0x410
+  [<c0102849>] do_notify_resume+0xb9/0x890
+  [<c0178fd0>] ? trace_hardirqs_off_caller+0x20/0xd0
+  [<c017b163>] ? lock_release_holdtime+0x33/0x210
+  [<c0165b50>] ? autoremove_wake_function+0x0/0x50
+  [<c017ba54>] ? trace_hardirqs_on_caller+0x134/0x190
+  [<c017babb>] ? trace_hardirqs_on+0xb/0x10
+  [<c0300ba4>] ? security_file_permission+0x14/0x20
+  [<c0215761>] ? vfs_write+0x131/0x190
+  [<c0214f50>] ? do_sync_write+0x0/0x120
+  [<c0103115>] ? sysenter_do_call+0x27/0x32
+  [<c01032d2>] work_notifysig+0x13/0x21
+
+CC: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c   |    9 +++++++--
+ fs/ext4/mballoc.c |    6 ------
+ 2 files changed, 7 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1088,7 +1088,7 @@ static int ext4_calc_metadata_amount(str
+ static void ext4_da_update_reserve_space(struct inode *inode, int used)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-      int total, mdb, mdb_free;
++      int total, mdb, mdb_free, mdb_claim = 0;
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       /* recalculate the number of metablocks still need to be reserved */
+@@ -1101,7 +1101,9 @@ static void ext4_da_update_reserve_space
+       if (mdb_free) {
+               /* Account for allocated meta_blocks */
+-              mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
++              mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
++              BUG_ON(mdb_free < mdb_claim);
++              mdb_free -= mdb_claim;
+               /* update fs dirty blocks counter */
+               percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+@@ -1112,8 +1114,11 @@ static void ext4_da_update_reserve_space
+       /* update per-inode reservations */
+       BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
+       EXT4_I(inode)->i_reserved_data_blocks -= used;
++      percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++      vfs_dq_claim_block(inode, used + mdb_claim);
++
+       /*
+        * free those over-booking quota for metadata blocks
+        */
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_
+       if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+               /* release all the reserved blocks if non delalloc */
+               percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+-      else {
+-              percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+-                                              ac->ac_b_ex.fe_len);
+-              /* convert reserved quota blocks to real quota blocks */
+-              vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
+-      }
+       if (sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group = ext4_flex_group(sbi,
diff --git a/queue-2.6.32/0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch b/queue-2.6.32/0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch
new file mode 100644 (file)
index 0000000..e051f86
--- /dev/null
@@ -0,0 +1,56 @@
+From 9ce5c64e94beb615d6581e7b8839bb0173903425 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <joerg.roedel@amd.com>
+Date: Mon, 17 May 2010 14:43:34 +0200
+Subject: KVM: SVM: Handle MCEs early in the vmexit process
+
+This patch moves handling of the MC vmexits to an earlier
+point in the vmexit. The handle_exit function is too late
+because the vcpu might already have changed its physical
+cpu.
+
+Cc: stable@kernel.org
+Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit fe5913e4e1700cbfc337f4b1da9ddb26f6a55586)
+---
+ arch/x86/kvm/svm.c |   15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -1257,7 +1257,7 @@ static int nm_interception(struct vcpu_s
+       return 1;
+ }
+-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
++static void svm_handle_mce(struct vcpu_svm *svm)
+ {
+       /*
+        * On an #MC intercept the MCE handler is not called automatically in
+@@ -1267,6 +1267,11 @@ static int mc_interception(struct vcpu_s
+               "int $0x12\n");
+       /* not sure if we ever come back to this point */
++      return;
++}
++
++static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
++{
+       return 1;
+ }
+@@ -2717,6 +2722,14 @@ static void svm_vcpu_run(struct kvm_vcpu
+               vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
+               vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
+       }
++
++      /*
++       * We need to handle MC intercepts here before the vcpu has a chance to
++       * change the physical cpu
++       */
++      if (unlikely(svm->vmcb->control.exit_code ==
++                   SVM_EXIT_EXCP_BASE + MC_VECTOR))
++              svm_handle_mce(svm);
+ }
+ #undef R
diff --git a/queue-2.6.32/0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch b/queue-2.6.32/0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch
new file mode 100644 (file)
index 0000000..85c7e28
--- /dev/null
@@ -0,0 +1,41 @@
+From f57e36578513418a67eef4912c8503a47a4993aa Mon Sep 17 00:00:00 2001
+From: Surbhi Palande <surbhi.palande@canonical.com>
+Date: Sun, 30 May 2010 22:49:16 -0400
+Subject: ext4: replace BUG() with return -EIO in ext4_ext_get_blocks
+
+commit 034fb4c95fc0fed4ec4a50778127b92c6f2aec01 upstream (as of v2.6.33-rc3)
+
+This patch fixes the Kernel BZ #14286.  When the address of an extent
+corresponding to a valid block is corrupted, a -EIO should be reported
+instead of a BUG().  This situation should not normally occur
+except in the case of a corrupted filesystem.  If however it does,
+then the system should not panic directly but depending on the mount
+time options appropriate action should be taken. If the mount options
+so permit, the I/O should be gracefully aborted by returning a -EIO.
+
+http://bugzilla.kernel.org/show_bug.cgi?id=14286
+
+Signed-off-by: Surbhi Palande <surbhi.palande@canonical.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3196,7 +3196,13 @@ int ext4_ext_get_blocks(handle_t *handle
+        * this situation is possible, though, _during_ tree modification;
+        * this is why assert can't be put in ext4_ext_find_extent()
+        */
+-      BUG_ON(path[depth].p_ext == NULL && depth != 0);
++      if (path[depth].p_ext == NULL && depth != 0) {
++              ext4_error(inode->i_sb, __func__, "bad extent address "
++                         "inode: %lu, iblock: %d, depth: %d",
++                         inode->i_ino, iblock, depth);
++              err = -EIO;
++              goto out2;
++      }
+       eh = path[depth].p_hdr;
+       ex = path[depth].p_ext;
diff --git a/queue-2.6.32/0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch b/queue-2.6.32/0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch
new file mode 100644 (file)
index 0000000..5d8d39d
--- /dev/null
@@ -0,0 +1,165 @@
+From a61279422bc32ecbf85e3a6a9349287c7df0b0b1 Mon Sep 17 00:00:00 2001
+From: Joerg Roedel <joerg.roedel@amd.com>
+Date: Mon, 17 May 2010 14:43:35 +0200
+Subject: KVM: SVM: Implement workaround for Erratum 383
+
+This patch implements a workaround for AMD erratum 383 into
+KVM. Without this erratum fix it is possible for a guest to
+kill the host machine. This patch implements the suggested
+workaround for hypervisors which will be published by the
+next revision guide update.
+
+[jan: fix overflow warning on i386]
+[xiao: fix unused variable warning]
+
+Cc: stable@kernel.org
+Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
+Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
+Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit 67ec66077799f2fef84b21a643912b179c422281)
+---
+ arch/x86/include/asm/msr-index.h |    1 
+ arch/x86/kvm/svm.c               |   84 ++++++++++++++++++++++++++++++++++++++-
+ 2 files changed, 84 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -106,6 +106,7 @@
+ #define MSR_AMD64_PATCH_LOADER                0xc0010020
+ #define MSR_AMD64_OSVW_ID_LENGTH      0xc0010140
+ #define MSR_AMD64_OSVW_STATUS         0xc0010141
++#define MSR_AMD64_DC_CFG              0xc0011022
+ #define MSR_AMD64_IBSFETCHCTL         0xc0011030
+ #define MSR_AMD64_IBSFETCHLINAD               0xc0011031
+ #define MSR_AMD64_IBSFETCHPHYSAD      0xc0011032
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -27,6 +27,7 @@
+ #include <linux/sched.h>
+ #include <linux/ftrace_event.h>
++#include <asm/tlbflush.h>
+ #include <asm/desc.h>
+ #include <asm/virtext.h>
+@@ -62,6 +63,8 @@ MODULE_LICENSE("GPL");
+ #define nsvm_printk(fmt, args...) do {} while(0)
+ #endif
++static bool erratum_383_found __read_mostly;
++
+ static const u32 host_save_user_msrs[] = {
+ #ifdef CONFIG_X86_64
+       MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+@@ -299,6 +302,31 @@ static void skip_emulated_instruction(st
+       svm_set_interrupt_shadow(vcpu, 0);
+ }
++static void svm_init_erratum_383(void)
++{
++      u32 low, high;
++      int err;
++      u64 val;
++
++      /* Only Fam10h is affected */
++      if (boot_cpu_data.x86 != 0x10)
++              return;
++
++      /* Use _safe variants to not break nested virtualization */
++      val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
++      if (err)
++              return;
++
++      val |= (1ULL << 47);
++
++      low  = lower_32_bits(val);
++      high = upper_32_bits(val);
++
++      native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
++
++      erratum_383_found = true;
++}
++
+ static int has_svm(void)
+ {
+       const char *msg;
+@@ -318,7 +346,6 @@ static void svm_hardware_disable(void *g
+ static void svm_hardware_enable(void *garbage)
+ {
+-
+       struct svm_cpu_data *svm_data;
+       uint64_t efer;
+       struct descriptor_table gdt_descr;
+@@ -350,6 +377,10 @@ static void svm_hardware_enable(void *ga
+       wrmsrl(MSR_VM_HSAVE_PA,
+              page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
++
++      svm_init_erratum_383();
++
++      return;
+ }
+ static void svm_cpu_uninit(int cpu)
+@@ -1257,8 +1288,59 @@ static int nm_interception(struct vcpu_s
+       return 1;
+ }
++static bool is_erratum_383(void)
++{
++      int err, i;
++      u64 value;
++
++      if (!erratum_383_found)
++              return false;
++
++      value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
++      if (err)
++              return false;
++
++      /* Bit 62 may or may not be set for this mce */
++      value &= ~(1ULL << 62);
++
++      if (value != 0xb600000000010015ULL)
++              return false;
++
++      /* Clear MCi_STATUS registers */
++      for (i = 0; i < 6; ++i)
++              native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
++
++      value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
++      if (!err) {
++              u32 low, high;
++
++              value &= ~(1ULL << 2);
++              low    = lower_32_bits(value);
++              high   = upper_32_bits(value);
++
++              native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
++      }
++
++      /* Flush tlb to evict multi-match entries */
++      __flush_tlb_all();
++
++      return true;
++}
++
+ static void svm_handle_mce(struct vcpu_svm *svm)
+ {
++      if (is_erratum_383()) {
++              /*
++               * Erratum 383 triggered. Guest state is corrupt so kill the
++               * guest.
++               */
++              pr_err("KVM: Guest triggered AMD Erratum 383\n");
++
++              set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
++
++              return;
++      }
++
+       /*
+        * On an #MC intercept the MCE handler is not called automatically in
+        * the host. So do it by hand here.
diff --git a/queue-2.6.32/0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch b/queue-2.6.32/0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch
new file mode 100644 (file)
index 0000000..75bbb45
--- /dev/null
@@ -0,0 +1,141 @@
+From de6e76774ecec8a14ef63d3ad383479ca98633e6 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:17 -0400
+Subject: ext4, jbd2: Add barriers for file systems with external journals
+
+commit cc3e1bea5d87635c519da657303690f5538bb4eb upstream (as of v2.6.33-rc3)
+
+This is a bit complicated because we are trying to optimize when we
+send barriers to the fs data disk.  We could just throw in an extra
+barrier to the data disk whenever we send a barrier to the journal
+disk, but that's not always strictly necessary.
+
+We only need to send a barrier during a commit when there are data
+blocks which must be written out due to an inode written in
+ordered mode, or if fsync() depends on the commit to force data blocks
+to disk.  Finally, before we drop transactions from the beginning of
+the journal during a checkpoint operation, we need to guarantee that
+any blocks that were flushed out to the data disk are firmly on the
+rust platter before we drop the transaction from the journal.
+
+Thanks to Oleg Drokin for pointing out this flaw in ext3/ext4.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/fsync.c      |   16 ++++++++++++++--
+ fs/jbd2/checkpoint.c |   15 +++++++++++++++
+ fs/jbd2/commit.c     |   19 +++++++++++--------
+ include/linux/jbd2.h |    1 +
+ 4 files changed, 41 insertions(+), 10 deletions(-)
+
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, st
+               return ext4_force_commit(inode->i_sb);
+       commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+-      if (jbd2_log_start_commit(journal, commit_tid))
++      if (jbd2_log_start_commit(journal, commit_tid)) {
++              /*
++               * When the journal is on a different device than the
++               * fs data disk, we need to issue the barrier in
++               * writeback mode.  (In ordered mode, the jbd2 layer
++               * will take care of issuing the barrier.  In
++               * data=journal, all of the data blocks are written to
++               * the journal device.)
++               */
++              if (ext4_should_writeback_data(inode) &&
++                  (journal->j_fs_dev != journal->j_dev) &&
++                  (journal->j_flags & JBD2_BARRIER))
++                      blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+               jbd2_log_wait_commit(journal, commit_tid);
+-      else if (journal->j_flags & JBD2_BARRIER)
++      } else if (journal->j_flags & JBD2_BARRIER)
+               blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+       return ret;
+ }
+--- a/fs/jbd2/checkpoint.c
++++ b/fs/jbd2/checkpoint.c
+@@ -22,6 +22,7 @@
+ #include <linux/jbd2.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
++#include <linux/blkdev.h>
+ #include <trace/events/jbd2.h>
+ /*
+@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t
+       journal->j_tail_sequence = first_tid;
+       journal->j_tail = blocknr;
+       spin_unlock(&journal->j_state_lock);
++
++      /*
++       * If there is an external journal, we need to make sure that
++       * any data blocks that were recently written out --- perhaps
++       * by jbd2_log_do_checkpoint() --- are flushed out before we
++       * drop the transactions from the external journal.  It's
++       * unlikely this will be necessary, especially with a
++       * appropriately sized journal, but we need this to guarantee
++       * correctness.  Fortunately jbd2_cleanup_journal_tail()
++       * doesn't get called all that often.
++       */
++      if ((journal->j_fs_dev != journal->j_dev) &&
++          (journal->j_flags & JBD2_BARRIER))
++              blkdev_issue_flush(journal->j_fs_dev, NULL);
+       if (!(journal->j_flags & JBD2_ABORT))
+               jbd2_journal_update_superblock(journal, 1);
+       return 0;
+--- a/fs/jbd2/commit.c
++++ b/fs/jbd2/commit.c
+@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(j
+                       ret = err;
+               spin_lock(&journal->j_list_lock);
+               J_ASSERT(jinode->i_transaction == commit_transaction);
++              commit_transaction->t_flushed_data_blocks = 1;
+               jinode->i_flags &= ~JI_COMMIT_RUNNING;
+               wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+       }
+@@ -708,8 +709,17 @@ start_journal_io:
+               }
+       }
+-      /* Done it all: now write the commit record asynchronously. */
++      /*
++       * If the journal is not located on the file system device,
++       * then we must flush the file system device before we issue
++       * the commit record
++       */
++      if (commit_transaction->t_flushed_data_blocks &&
++          (journal->j_fs_dev != journal->j_dev) &&
++          (journal->j_flags & JBD2_BARRIER))
++              blkdev_issue_flush(journal->j_fs_dev, NULL);
++      /* Done it all: now write the commit record asynchronously. */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                     JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+               err = journal_submit_commit_record(journal, commit_transaction,
+@@ -720,13 +730,6 @@ start_journal_io:
+                       blkdev_issue_flush(journal->j_dev, NULL);
+       }
+-      /*
+-       * This is the right place to wait for data buffers both for ASYNC
+-       * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+-       * the commit block went to disk (which happens above). If commit is
+-       * SYNC, we need to wait for data buffers before we start writing
+-       * commit block, which happens below in such setting.
+-       */
+       err = journal_finish_inode_data_buffers(journal, commit_transaction);
+       if (err) {
+               printk(KERN_WARNING
+--- a/include/linux/jbd2.h
++++ b/include/linux/jbd2.h
+@@ -653,6 +653,7 @@ struct transaction_s
+        * waiting for it to finish.
+        */
+       unsigned int t_synchronous_commit:1;
++      unsigned int t_flushed_data_blocks:1;
+       /*
+        * For use by the filesystem to store fs-specific data
diff --git a/queue-2.6.32/0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch b/queue-2.6.32/0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch
new file mode 100644 (file)
index 0000000..e933f4b
--- /dev/null
@@ -0,0 +1,34 @@
+From 51e00c5c8ddedce8030521bf8645d90b82854980 Mon Sep 17 00:00:00 2001
+From: Marcelo Tosatti <mtosatti@redhat.com>
+Date: Fri, 28 May 2010 09:44:59 -0300
+Subject: KVM: MMU: invalidate and flush on spte small->large page size change
+
+Always invalidate spte and flush TLBs when changing page size, to make
+sure different sized translations for the same address are never cached
+in a CPU's TLB.
+
+Currently the only case where this occurs is when a non-leaf spte pointer is
+overwritten by a leaf, large spte entry. This can happen after dirty
+logging is disabled on a memslot, for example.
+
+Noticed by Andrea.
+
+KVM-Stable-Tag
+Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Avi Kivity <avi@redhat.com>
+(cherry picked from commit 3be2264be3c00865116f997dc53ebcc90fe7fc4b)
+---
+ arch/x86/kvm/mmu.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -1901,6 +1901,8 @@ static void mmu_set_spte(struct kvm_vcpu
+                       child = page_header(pte & PT64_BASE_ADDR_MASK);
+                       mmu_page_remove_parent_pte(child, sptep);
++                      __set_spte(sptep, shadow_trap_nonpresent_pte);
++                      kvm_flush_remote_tlbs(vcpu->kvm);
+               } else if (pfn != spte_to_pfn(*sptep)) {
+                       pgprintk("hfn old %lx new %lx\n",
+                                spte_to_pfn(*sptep), pfn);
diff --git a/queue-2.6.32/0004-ext4-Eliminate-potential-double-free-on-error-path.patch b/queue-2.6.32/0004-ext4-Eliminate-potential-double-free-on-error-path.patch
new file mode 100644 (file)
index 0000000..c590c7a
--- /dev/null
@@ -0,0 +1,55 @@
+From 857855f2523af677951cb3bba61396813df6128d Mon Sep 17 00:00:00 2001
+From: Julia Lawall <julia@diku.dk>
+Date: Sun, 30 May 2010 22:49:18 -0400
+Subject: ext4: Eliminate potential double free on error path
+
+commit d3533d72e7478a61a3e1936956fc825289a2acf4 upstream (as of v2.6.33-rc3)
+
+b_entry_name and buffer are initially NULL, are initialized within a loop
+to the result of calling kmalloc, and are freed at the bottom of this loop.
+The loop contains gotos to cleanup, which also frees b_entry_name and
+buffer.  Some of these gotos are before the reinitializations of
+b_entry_name and buffer.  To maintain the invariant that b_entry_name and
+buffer are NULL at the top of the loop, and thus acceptable arguments to
+kfree, these variables are now set to NULL after the kfrees.
+
+This seems to be the simplest solution.  A more complicated solution
+would be to introduce more labels in the error handling code at the end of
+the function.
+
+A simplified version of the semantic match that finds this problem is as
+follows: (http://coccinelle.lip6.fr/)
+
+// <smpl>
+@r@
+identifier E;
+expression E1;
+iterator I;
+statement S;
+@@
+
+*kfree(E);
+... when != E = E1
+    when != I(E,...) S
+    when != &E
+*kfree(E);
+// </smpl>
+
+Signed-off-by: Julia Lawall <julia@diku.dk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/xattr.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -1327,6 +1327,8 @@ retry:
+                       goto cleanup;
+               kfree(b_entry_name);
+               kfree(buffer);
++              b_entry_name = NULL;
++              buffer = NULL;
+               brelse(is->iloc.bh);
+               kfree(is);
+               kfree(bs);
diff --git a/queue-2.6.32/0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch b/queue-2.6.32/0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch
new file mode 100644 (file)
index 0000000..1de448b
--- /dev/null
@@ -0,0 +1,33 @@
+From 657eba6d2e9501946a11cc4f53148e46e4b3cbe1 Mon Sep 17 00:00:00 2001
+From: Richard Kennedy <richard@rsk.demon.co.uk>
+Date: Sun, 30 May 2010 22:49:19 -0400
+Subject: ext4: return correct wbc.nr_to_write in ext4_da_writepages
+
+commit 2faf2e19dd0e060eeb32442858ef495ac3083277 upstream (as of v2.6.33-rc3)
+
+When ext4_da_writepages increases the nr_to_write in writeback_control
+then it must always re-base the return value.  Originally there was a
+(misguided) attempt to prevent wbc.nr_to_write from going negative.  In
+fact, it's necessary to allow nr_to_write to be negative so that
+wb_writeback() can correctly calculate how many pages were actually
+written.
+
+Signed-off-by: Richard Kennedy <richard@rsk.demon.co.uk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3010,8 +3010,7 @@ retry:
+ out_writepages:
+       if (!no_nrwrite_index_update)
+               wbc->no_nrwrite_index_update = 0;
+-      if (wbc->nr_to_write > nr_to_writebump)
+-              wbc->nr_to_write -= nr_to_writebump;
++      wbc->nr_to_write -= nr_to_writebump;
+       wbc->range_start = range_start;
+       trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
+       return ret;
diff --git a/queue-2.6.32/0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch b/queue-2.6.32/0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch
new file mode 100644 (file)
index 0000000..f42e378
--- /dev/null
@@ -0,0 +1,55 @@
+From 436e2704a8b589fb1217add4f9e5be480773ca6c Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:20 -0400
+Subject: ext4: Ensure zeroout blocks have no dirty metadata
+
+commit 515f41c33a9d44a964264c9511ad2c869af1fac3 upstream (as of v2.6.33-rc3)
+
+This fixes a bug (found by Curt Wohlgemuth) in which new blocks
+returned from an extent created with ext4_ext_zeroout() can have dirty
+metadata still associated with them.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |   20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3029,6 +3029,14 @@ out:
+       return err;
+ }
++static void unmap_underlying_metadata_blocks(struct block_device *bdev,
++                      sector_t block, int count)
++{
++      int i;
++      for (i = 0; i < count; i++)
++                unmap_underlying_metadata(bdev, block + i);
++}
++
+ static int
+ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+                       ext4_lblk_t iblock, unsigned int max_blocks,
+@@ -3104,6 +3112,18 @@ out:
+       } else
+               allocated = ret;
+       set_buffer_new(bh_result);
++      /*
++       * if we allocated more blocks than requested
++       * we need to make sure we unmap the extra block
++       * allocated. The actual needed block will get
++       * unmapped later when we find the buffer_head marked
++       * new.
++       */
++      if (allocated > max_blocks) {
++              unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
++                                      newblock + max_blocks,
++                                      allocated - max_blocks);
++      }
+ map_out:
+       set_buffer_mapped(bh_result);
+ out1:
diff --git a/queue-2.6.32/0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch b/queue-2.6.32/0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch
new file mode 100644 (file)
index 0000000..2c83789
--- /dev/null
@@ -0,0 +1,266 @@
+From 74ded2cc0427839ccdda41f2738130f0eea77fde Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:21 -0400
+Subject: ext4: Patch up how we claim metadata blocks for quota purposes
+
+commit 0637c6f4135f592f094207c7c21e7c0fc5557834 upstream (as of v2.6.33-rc3)
+
+As reported in Kernel Bugzilla #14936, commit d21cd8f triggered a BUG
+in the function ext4_da_update_reserve_space() found in
+fs/ext4/inode.c.  This BUG() was ultimately caused by the fact
+that ext4_calc_metadata_amount() can severely over-estimate how many
+metadata blocks will be needed, especially when using direct
+block-mapped files.
+
+In addition, it can also badly *under* estimate how much space is
+needed, since ext4_calc_metadata_amount() assumes that the blocks are
+contiguous, and this is not always true.  If the application is
+writing blocks to a sparse file, the number of metadata blocks
+necessary can be severely underestimated by the functions
+ext4_da_reserve_space(), ext4_da_update_reserve_space() and
+ext4_da_release_space().  This was the cause of the dq_claim_space
+reports found on kerneloops.org.
+
+Unfortunately, doing this right means that we need to massively
+over-estimate the amount of free space needed.  So in some cases we
+may need to force the inode to be written to disk asynchronously in
+order to avoid spurious quota failures.
+
+http://bugzilla.kernel.org/show_bug.cgi?id=14936
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |  153 ++++++++++++++++++++++++++++++--------------------------
+ 1 file changed, 82 insertions(+), 71 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1085,43 +1085,47 @@ static int ext4_calc_metadata_amount(str
+       return ext4_indirect_calc_metadata_amount(inode, blocks);
+ }
++/*
++ * Called with i_data_sem down, which is important since we can call
++ * ext4_discard_preallocations() from here.
++ */
+ static void ext4_da_update_reserve_space(struct inode *inode, int used)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-      int total, mdb, mdb_free, mdb_claim = 0;
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      int mdb_free = 0;
+-      spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+-      /* recalculate the number of metablocks still need to be reserved */
+-      total = EXT4_I(inode)->i_reserved_data_blocks - used;
+-      mdb = ext4_calc_metadata_amount(inode, total);
+-
+-      /* figure out how many metablocks to release */
+-      BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+-      mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+-
+-      if (mdb_free) {
+-              /* Account for allocated meta_blocks */
+-              mdb_claim = EXT4_I(inode)->i_allocated_meta_blocks;
+-              BUG_ON(mdb_free < mdb_claim);
+-              mdb_free -= mdb_claim;
++      spin_lock(&ei->i_block_reservation_lock);
++      if (unlikely(used > ei->i_reserved_data_blocks)) {
++              ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
++                       "with only %d reserved data blocks\n",
++                       __func__, inode->i_ino, used,
++                       ei->i_reserved_data_blocks);
++              WARN_ON(1);
++              used = ei->i_reserved_data_blocks;
++      }
++
++      /* Update per-inode reservations */
++      ei->i_reserved_data_blocks -= used;
++      used += ei->i_allocated_meta_blocks;
++      ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
++      ei->i_allocated_meta_blocks = 0;
++      percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+-              /* update fs dirty blocks counter */
++      if (ei->i_reserved_data_blocks == 0) {
++              /*
++               * We can release all of the reserved metadata blocks
++               * only when we have written all of the delayed
++               * allocation blocks.
++               */
++              mdb_free = ei->i_allocated_meta_blocks;
+               percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+-              EXT4_I(inode)->i_allocated_meta_blocks = 0;
+-              EXT4_I(inode)->i_reserved_meta_blocks = mdb;
++              ei->i_allocated_meta_blocks = 0;
+       }
+-
+-      /* update per-inode reservations */
+-      BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
+-      EXT4_I(inode)->i_reserved_data_blocks -= used;
+-      percpu_counter_sub(&sbi->s_dirtyblocks_counter, used + mdb_claim);
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+-      vfs_dq_claim_block(inode, used + mdb_claim);
+-
+-      /*
+-       * free those over-booking quota for metadata blocks
+-       */
++      /* Update quota subsystem */
++      vfs_dq_claim_block(inode, used);
+       if (mdb_free)
+               vfs_dq_release_reservation_block(inode, mdb_free);
+@@ -1130,7 +1134,8 @@ static void ext4_da_update_reserve_space
+        * there aren't any writers on the inode, we can discard the
+        * inode's preallocations.
+        */
+-      if (!total && (atomic_read(&inode->i_writecount) == 0))
++      if ((ei->i_reserved_data_blocks == 0) &&
++          (atomic_read(&inode->i_writecount) == 0))
+               ext4_discard_preallocations(inode);
+ }
+@@ -1843,7 +1848,8 @@ static int ext4_da_reserve_space(struct
+ {
+       int retries = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-      unsigned long md_needed, mdblocks, total = 0;
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      unsigned long md_needed, md_reserved, total = 0;
+       /*
+        * recalculate the amount of metadata blocks to reserve
+@@ -1851,35 +1857,44 @@ static int ext4_da_reserve_space(struct
+        * worse case is one extent per block
+        */
+ repeat:
+-      spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+-      total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+-      mdblocks = ext4_calc_metadata_amount(inode, total);
+-      BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+-
+-      md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
++      spin_lock(&ei->i_block_reservation_lock);
++      md_reserved = ei->i_reserved_meta_blocks;
++      md_needed = ext4_calc_metadata_amount(inode, nrblocks);
+       total = md_needed + nrblocks;
+-      spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++      spin_unlock(&ei->i_block_reservation_lock);
+       /*
+        * Make quota reservation here to prevent quota overflow
+        * later. Real quota accounting is done at pages writeout
+        * time.
+        */
+-      if (vfs_dq_reserve_block(inode, total))
++      if (vfs_dq_reserve_block(inode, total)) {
++              /*
++               * We tend to badly over-estimate the amount of
++               * metadata blocks which are needed, so if we have
++               * reserved any metadata blocks, try to force out the
++               * inode and see if we have any better luck.
++               */
++              if (md_reserved && retries++ <= 3)
++                      goto retry;
+               return -EDQUOT;
++      }
+       if (ext4_claim_free_blocks(sbi, total)) {
+               vfs_dq_release_reservation_block(inode, total);
+               if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
++              retry:
++                      if (md_reserved)
++                              write_inode_now(inode, (retries == 3));
+                       yield();
+                       goto repeat;
+               }
+               return -ENOSPC;
+       }
+-      spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+-      EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+-      EXT4_I(inode)->i_reserved_meta_blocks += md_needed;
+-      spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
++      spin_lock(&ei->i_block_reservation_lock);
++      ei->i_reserved_data_blocks += nrblocks;
++      ei->i_reserved_meta_blocks += md_needed;
++      spin_unlock(&ei->i_block_reservation_lock);
+       return 0;       /* success */
+ }
+@@ -1887,49 +1902,45 @@ repeat:
+ static void ext4_da_release_space(struct inode *inode, int to_free)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+-      int total, mdb, mdb_free, release;
++      struct ext4_inode_info *ei = EXT4_I(inode);
+       if (!to_free)
+               return;         /* Nothing to release, exit */
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+-      if (!EXT4_I(inode)->i_reserved_data_blocks) {
++      if (unlikely(to_free > ei->i_reserved_data_blocks)) {
+               /*
+-               * if there is no reserved blocks, but we try to free some
+-               * then the counter is messed up somewhere.
+-               * but since this function is called from invalidate
+-               * page, it's harmless to return without any action
++               * if there aren't enough reserved blocks, then the
++               * counter is messed up somewhere.  Since this
++               * function is called from invalidate page, it's
++               * harmless to return without any action.
+                */
+-              printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+-                          "blocks for inode %lu, but there is no reserved "
+-                          "data blocks\n", to_free, inode->i_ino);
+-              spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+-              return;
++              ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
++                       "ino %lu, to_free %d with only %d reserved "
++                       "data blocks\n", inode->i_ino, to_free,
++                       ei->i_reserved_data_blocks);
++              WARN_ON(1);
++              to_free = ei->i_reserved_data_blocks;
+       }
++      ei->i_reserved_data_blocks -= to_free;
+-      /* recalculate the number of metablocks still need to be reserved */
+-      total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
+-      mdb = ext4_calc_metadata_amount(inode, total);
+-
+-      /* figure out how many metablocks to release */
+-      BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+-      mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+-
+-      release = to_free + mdb_free;
+-
+-      /* update fs dirty blocks counter for truncate case */
+-      percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
++      if (ei->i_reserved_data_blocks == 0) {
++              /*
++               * We can release all of the reserved metadata blocks
++               * only when we have written all of the delayed
++               * allocation blocks.
++               */
++              to_free += ei->i_allocated_meta_blocks;
++              ei->i_allocated_meta_blocks = 0;
++      }
+-      /* update per-inode reservations */
+-      BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+-      EXT4_I(inode)->i_reserved_data_blocks -= to_free;
++      /* update fs dirty blocks counter */
++      percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
+-      BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+-      EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+-      vfs_dq_release_reservation_block(inode, release);
++      vfs_dq_release_reservation_block(inode, to_free);
+ }
+ static void ext4_da_page_release_reservation(struct page *page,
diff --git a/queue-2.6.32/0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch b/queue-2.6.32/0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch
new file mode 100644 (file)
index 0000000..f150aa6
--- /dev/null
@@ -0,0 +1,41 @@
+From 81799214a5369211cf9046735dafcf59a29e7454 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:22 -0400
+Subject: ext4: Fix accounting of reserved metadata blocks
+
+commit ee5f4d9cdf32fd99172d11665c592a288c2b1ff4 upstream (as of v2.6.33-rc3)
+
+Commit 0637c6f had a typo which caused the reserved metadata blocks to
+not be released correctly.   Fix this.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1118,9 +1118,9 @@ static void ext4_da_update_reserve_space
+                * only when we have written all of the delayed
+                * allocation blocks.
+                */
+-              mdb_free = ei->i_allocated_meta_blocks;
++              mdb_free = ei->i_reserved_meta_blocks;
++              ei->i_reserved_meta_blocks = 0;
+               percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+-              ei->i_allocated_meta_blocks = 0;
+       }
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+@@ -1931,8 +1931,8 @@ static void ext4_da_release_space(struct
+                * only when we have written all of the delayed
+                * allocation blocks.
+                */
+-              to_free += ei->i_allocated_meta_blocks;
+-              ei->i_allocated_meta_blocks = 0;
++              to_free += ei->i_reserved_meta_blocks;
++              ei->i_reserved_meta_blocks = 0;
+       }
+       /* update fs dirty blocks counter */
diff --git a/queue-2.6.32/0009-ext4-Calculate-metadata-requirements-more-accurately.patch b/queue-2.6.32/0009-ext4-Calculate-metadata-requirements-more-accurately.patch
new file mode 100644 (file)
index 0000000..a3acc3c
--- /dev/null
@@ -0,0 +1,295 @@
+From 665d82f8d039371ba402227e99d3b95078c97fb9 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:23 -0400
+Subject: ext4: Calculate metadata requirements more accurately
+
+commit 9d0be50230b333005635967f7ecd4897dbfd181b upstream (as of v2.6.33-rc3)
+
+In the past, ext4_calc_metadata_amount(), and its sub-functions
+ext4_ext_calc_metadata_amount() and ext4_indirect_calc_metadata_amount()
+badly over-estimated the number of metadata blocks that might be
+required for delayed allocation blocks.  This didn't matter as much
+when functions which managed the reserved metadata blocks were more
+aggressive about dropping reserved metadata blocks as delayed
+allocation blocks were written, but unfortunately they were too
+aggressive.  This was fixed in commit 0637c6f, but as a result the
+over-estimation by ext4_calc_metadata_amount() would lead to reserving
+2-3 times the number of pending delayed allocation blocks as
+potentially required metadata blocks.  So if there is 1 megabyte of
+blocks which have not yet been allocated, up to 3 megabytes of
+space would get reserved out of the user's quota and from the file
+system free space pool until all of the inode's data blocks have been
+allocated.
+
+This commit addresses this problem by much more accurately estimating
+the number of metadata blocks that will be required.  It will still
+somewhat over-estimate the number of blocks needed, since it must make
+a worst case estimate not knowing which physical blocks will be
+needed, but it is much more accurate than before.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h         |    2 +
+ fs/ext4/ext4_extents.h |    3 +-
+ fs/ext4/extents.c      |   49 ++++++++++++++++++++++++-------------
+ fs/ext4/inode.c        |   62 +++++++++++++++++++++++++++--------------------
+ fs/ext4/super.c        |    1 +
+ 5 files changed, 73 insertions(+), 44 deletions(-)
+
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 4a825c1..23bfbbc 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -693,6 +693,8 @@ struct ext4_inode_info {
+       unsigned int i_reserved_meta_blocks;
+       unsigned int i_allocated_meta_blocks;
+       unsigned short i_delalloc_reserved_flag;
++      sector_t i_da_metadata_calc_last_lblock;
++      int i_da_metadata_calc_len;
+       /* on-disk additional length */
+       __u16 i_extra_isize;
+diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
+index 2ca6864..bdb6ce7 100644
+--- a/fs/ext4/ext4_extents.h
++++ b/fs/ext4/ext4_extents.h
+@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
+       ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
+ }
+-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
++extern int ext4_ext_calc_metadata_amount(struct inode *inode,
++                                       sector_t lblocks);
+ extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
+ extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
+ extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
+diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
+index b14fb6d..5f03f9f 100644
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
+  * to allocate @blocks
+  * Worse case is one block per extent
+  */
+-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
++int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
+ {
+-      int lcap, icap, rcap, leafs, idxs, num;
+-      int newextents = blocks;
+-
+-      rcap = ext4_ext_space_root_idx(inode, 0);
+-      lcap = ext4_ext_space_block(inode, 0);
+-      icap = ext4_ext_space_block_idx(inode, 0);
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      int idxs, num = 0;
+-      /* number of new leaf blocks needed */
+-      num = leafs = (newextents + lcap - 1) / lcap;
++      idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
++              / sizeof(struct ext4_extent_idx));
+       /*
+-       * Worse case, we need separate index block(s)
+-       * to link all new leaf blocks
++       * If the new delayed allocation block is contiguous with the
++       * previous da block, it can share index blocks with the
++       * previous block, so we only need to allocate a new index
++       * block every idxs leaf blocks.  At ldxs**2 blocks, we need
++       * an additional index block, and at ldxs**3 blocks, yet
++       * another index blocks.
+        */
+-      idxs = (leafs + icap - 1) / icap;
+-      do {
+-              num += idxs;
+-              idxs = (idxs + icap - 1) / icap;
+-      } while (idxs > rcap);
++      if (ei->i_da_metadata_calc_len &&
++          ei->i_da_metadata_calc_last_lblock+1 == lblock) {
++              if ((ei->i_da_metadata_calc_len % idxs) == 0)
++                      num++;
++              if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
++                      num++;
++              if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
++                      num++;
++                      ei->i_da_metadata_calc_len = 0;
++              } else
++                      ei->i_da_metadata_calc_len++;
++              ei->i_da_metadata_calc_last_lblock++;
++              return num;
++      }
+-      return num;
++      /*
++       * In the worst case we need a new set of index blocks at
++       * every level of the inode's extent tree.
++       */
++      ei->i_da_metadata_calc_len = 1;
++      ei->i_da_metadata_calc_last_lblock = lblock;
++      return ext_depth(inode) + 1;
+ }
+ static int
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 533bb84..2e3f422 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1051,38 +1051,44 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
+       return &EXT4_I(inode)->i_reserved_quota;
+ }
+ #endif
++
+ /*
+  * Calculate the number of metadata blocks need to reserve
+- * to allocate @blocks for non extent file based file
++ * to allocate a new block at @lblocks for non extent file based file
+  */
+-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
++static int ext4_indirect_calc_metadata_amount(struct inode *inode,
++                                            sector_t lblock)
+ {
+-      int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+-      int ind_blks, dind_blks, tind_blks;
+-
+-      /* number of new indirect blocks needed */
+-      ind_blks = (blocks + icap - 1) / icap;
++      struct ext4_inode_info *ei = EXT4_I(inode);
++      int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
++      int blk_bits;
+-      dind_blks = (ind_blks + icap - 1) / icap;
++      if (lblock < EXT4_NDIR_BLOCKS)
++              return 0;
+-      tind_blks = 1;
++      lblock -= EXT4_NDIR_BLOCKS;
+-      return ind_blks + dind_blks + tind_blks;
++      if (ei->i_da_metadata_calc_len &&
++          (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
++              ei->i_da_metadata_calc_len++;
++              return 0;
++      }
++      ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
++      ei->i_da_metadata_calc_len = 1;
++      blk_bits = roundup_pow_of_two(lblock + 1);
++      return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+ }
+ /*
+  * Calculate the number of metadata blocks need to reserve
+- * to allocate given number of blocks
++ * to allocate a block located at @lblock
+  */
+-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
++static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+ {
+-      if (!blocks)
+-              return 0;
+-
+       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+-              return ext4_ext_calc_metadata_amount(inode, blocks);
++              return ext4_ext_calc_metadata_amount(inode, lblock);
+-      return ext4_indirect_calc_metadata_amount(inode, blocks);
++      return ext4_indirect_calc_metadata_amount(inode, lblock);
+ }
+ /*
+@@ -1120,6 +1126,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
+                */
+               mdb_free = ei->i_reserved_meta_blocks;
+               ei->i_reserved_meta_blocks = 0;
++              ei->i_da_metadata_calc_len = 0;
+               percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+       }
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+@@ -1844,12 +1851,15 @@ static int ext4_journalled_write_end(struct file *file,
+       return ret ? ret : copied;
+ }
+-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
++/*
++ * Reserve a single block located at lblock
++ */
++static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
+ {
+       int retries = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+-      unsigned long md_needed, md_reserved, total = 0;
++      unsigned long md_needed, md_reserved;
+       /*
+        * recalculate the amount of metadata blocks to reserve
+@@ -1859,8 +1869,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+ repeat:
+       spin_lock(&ei->i_block_reservation_lock);
+       md_reserved = ei->i_reserved_meta_blocks;
+-      md_needed = ext4_calc_metadata_amount(inode, nrblocks);
+-      total = md_needed + nrblocks;
++      md_needed = ext4_calc_metadata_amount(inode, lblock);
+       spin_unlock(&ei->i_block_reservation_lock);
+       /*
+@@ -1868,7 +1877,7 @@ repeat:
+        * later. Real quota accounting is done at pages writeout
+        * time.
+        */
+-      if (vfs_dq_reserve_block(inode, total)) {
++      if (vfs_dq_reserve_block(inode, md_needed + 1)) {
+               /*
+                * We tend to badly over-estimate the amount of
+                * metadata blocks which are needed, so if we have
+@@ -1880,8 +1889,8 @@ repeat:
+               return -EDQUOT;
+       }
+-      if (ext4_claim_free_blocks(sbi, total)) {
+-              vfs_dq_release_reservation_block(inode, total);
++      if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
++              vfs_dq_release_reservation_block(inode, md_needed + 1);
+               if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+               retry:
+                       if (md_reserved)
+@@ -1892,7 +1901,7 @@ repeat:
+               return -ENOSPC;
+       }
+       spin_lock(&ei->i_block_reservation_lock);
+-      ei->i_reserved_data_blocks += nrblocks;
++      ei->i_reserved_data_blocks++;
+       ei->i_reserved_meta_blocks += md_needed;
+       spin_unlock(&ei->i_block_reservation_lock);
+@@ -1933,6 +1942,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
+                */
+               to_free += ei->i_reserved_meta_blocks;
+               ei->i_reserved_meta_blocks = 0;
++              ei->i_da_metadata_calc_len = 0;
+       }
+       /* update fs dirty blocks counter */
+@@ -2546,7 +2556,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+                * XXX: __block_prepare_write() unmaps passed block,
+                * is it OK?
+                */
+-              ret = ext4_da_reserve_space(inode, 1);
++              ret = ext4_da_reserve_space(inode, iblock);
+               if (ret)
+                       /* not enough space to reserve */
+                       return ret;
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 92943f2..252f30b 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -702,6 +702,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       ei->i_reserved_data_blocks = 0;
+       ei->i_reserved_meta_blocks = 0;
+       ei->i_allocated_meta_blocks = 0;
++      ei->i_da_metadata_calc_len = 0;
+       ei->i_delalloc_reserved_flag = 0;
+       spin_lock_init(&(ei->i_block_reservation_lock));
+ #ifdef CONFIG_QUOTA
+-- 
+1.7.1
+
diff --git a/queue-2.6.32/0010-ext4-Handle-EDQUOT-error-on-write.patch b/queue-2.6.32/0010-ext4-Handle-EDQUOT-error-on-write.patch
new file mode 100644 (file)
index 0000000..3bec88c
--- /dev/null
@@ -0,0 +1,76 @@
+From 34e8248f530c4db6c4ba200c945257e0713d9905 Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:24 -0400
+Subject: ext4: Handle -EDQUOT error on write
+
+commit 1db913823c0f8360fccbd24ca67eb073966a5ffd upstream (as of v2.6.33-rc6)
+
+We need to release the journal before we do a write_inode.  Otherwise
+we could deadlock.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |   32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1877,24 +1877,12 @@ repeat:
+        * later. Real quota accounting is done at pages writeout
+        * time.
+        */
+-      if (vfs_dq_reserve_block(inode, md_needed + 1)) {
+-              /*
+-               * We tend to badly over-estimate the amount of
+-               * metadata blocks which are needed, so if we have
+-               * reserved any metadata blocks, try to force out the
+-               * inode and see if we have any better luck.
+-               */
+-              if (md_reserved && retries++ <= 3)
+-                      goto retry;
++      if (vfs_dq_reserve_block(inode, md_needed + 1))
+               return -EDQUOT;
+-      }
+       if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
+               vfs_dq_release_reservation_block(inode, md_needed + 1);
+               if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+-              retry:
+-                      if (md_reserved)
+-                              write_inode_now(inode, (retries == 3));
+                       yield();
+                       goto repeat;
+               }
+@@ -3075,7 +3063,7 @@ static int ext4_da_write_begin(struct fi
+                              loff_t pos, unsigned len, unsigned flags,
+                              struct page **pagep, void **fsdata)
+ {
+-      int ret, retries = 0;
++      int ret, retries = 0, quota_retries = 0;
+       struct page *page;
+       pgoff_t index;
+       unsigned from, to;
+@@ -3134,6 +3122,22 @@ retry:
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
++
++      if ((ret == -EDQUOT) &&
++          EXT4_I(inode)->i_reserved_meta_blocks &&
++          (quota_retries++ < 3)) {
++              /*
++               * Since we often over-estimate the number of meta
++               * data blocks required, we may sometimes get a
++               * spurios out of quota error even though there would
++               * be enough space once we write the data blocks and
++               * find out how many meta data blocks were _really_
++               * required.  So try forcing the inode write to see if
++               * that helps.
++               */
++              write_inode_now(inode, (quota_retries == 3));
++              goto retry;
++      }
+ out:
+       return ret;
+ }
diff --git a/queue-2.6.32/0011-ext4-Fix-quota-accounting-error-with-fallocate.patch b/queue-2.6.32/0011-ext4-Fix-quota-accounting-error-with-fallocate.patch
new file mode 100644 (file)
index 0000000..1974583
--- /dev/null
@@ -0,0 +1,154 @@
+From 09e8f5642b741ecfdd05c259b47796f85fdd01aa Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:25 -0400
+Subject: ext4: Fix quota accounting error with fallocate
+
+commit 5f634d064c709ea02c3cdaa850a08323a4a4bf28 upstream (as of v2.6.33-rc6)
+
+When we fallocate a region of the file which we had recently written,
+and which is still in the page cache marked as delayed allocated blocks
+we need to make sure we don't do the quota update on writepage path.
+This is because the needed quota update would have already been done
+by fallocate.
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h    |    2 ++
+ fs/ext4/extents.c |   21 +++++++++++++++++++++
+ fs/ext4/inode.c   |   44 +++++++++++++++++++++++++++++++-------------
+ 3 files changed, 54 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1440,6 +1440,8 @@ extern int ext4_block_truncate_page(hand
+ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+ extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+ extern int flush_aio_dio_completed_IO(struct inode *inode);
++extern void ext4_da_update_reserve_space(struct inode *inode,
++                                      int used, int quota_claim);
+ /* ioctl.c */
+ extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
+ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3138,7 +3138,19 @@ out:
+               unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+                                       newblock + max_blocks,
+                                       allocated - max_blocks);
++              allocated = max_blocks;
+       }
++
++      /*
++       * If we have done fallocate with the offset that is already
++       * delayed allocated, we would have block reservation
++       * and quota reservation done in the delayed write path.
++       * But fallocate would have already updated quota and block
++       * count for this offset. So cancel these reservation
++       */
++      if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++              ext4_da_update_reserve_space(inode, allocated, 0);
++
+ map_out:
+       set_buffer_mapped(bh_result);
+ out1:
+@@ -3374,9 +3386,18 @@ int ext4_ext_get_blocks(handle_t *handle
+       /* previous routine could use block we allocated */
+       newblock = ext_pblock(&newex);
+       allocated = ext4_ext_get_actual_len(&newex);
++      if (allocated > max_blocks)
++              allocated = max_blocks;
+       set_buffer_new(bh_result);
+       /*
++       * Update reserved blocks/metadata blocks after successful
++       * block allocation which had been deferred till now.
++       */
++      if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++              ext4_da_update_reserve_space(inode, allocated, 1);
++
++      /*
+        * Cache the extent and update transaction to commit on fdatasync only
+        * when it is _not_ an uninitialized extent.
+        */
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1095,11 +1095,12 @@ static int ext4_calc_metadata_amount(str
+  * Called with i_data_sem down, which is important since we can call
+  * ext4_discard_preallocations() from here.
+  */
+-static void ext4_da_update_reserve_space(struct inode *inode, int used)
++void ext4_da_update_reserve_space(struct inode *inode,
++                                      int used, int quota_claim)
+ {
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+-      int mdb_free = 0;
++      int mdb_free = 0, allocated_meta_blocks = 0;
+       spin_lock(&ei->i_block_reservation_lock);
+       if (unlikely(used > ei->i_reserved_data_blocks)) {
+@@ -1115,6 +1116,7 @@ static void ext4_da_update_reserve_space
+       ei->i_reserved_data_blocks -= used;
+       used += ei->i_allocated_meta_blocks;
+       ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
++      allocated_meta_blocks = ei->i_allocated_meta_blocks;
+       ei->i_allocated_meta_blocks = 0;
+       percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+@@ -1132,9 +1134,23 @@ static void ext4_da_update_reserve_space
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+       /* Update quota subsystem */
+-      vfs_dq_claim_block(inode, used);
+-      if (mdb_free)
+-              vfs_dq_release_reservation_block(inode, mdb_free);
++      if (quota_claim) {
++              vfs_dq_claim_block(inode, used);
++              if (mdb_free)
++                      vfs_dq_release_reservation_block(inode, mdb_free);
++      } else {
++              /*
++               * We did fallocate with an offset that is already delayed
++               * allocated. So on delayed allocated writeback we should
++               * not update the quota for allocated blocks. But then
++               * converting an fallocate region to initialized region would
++               * have caused a metadata allocation. So claim quota for
++               * that
++               */
++              if (allocated_meta_blocks)
++                      vfs_dq_claim_block(inode, allocated_meta_blocks);
++              vfs_dq_release_reservation_block(inode, mdb_free + used);
++      }
+       /*
+        * If we have done all the pending block allocations and if
+@@ -1334,18 +1350,20 @@ int ext4_get_blocks(handle_t *handle, st
+                        */
+                       EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
+               }
+-      }
++              /*
++               * Update reserved blocks/metadata blocks after successful
++               * block allocation which had been deferred till now. We don't
++               * support fallocate for non extent files. So we can update
++               * reserve space here.
++               */
++              if ((retval > 0) &&
++                      (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
++                      ext4_da_update_reserve_space(inode, retval, 1);
++      }
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+-      /*
+-       * Update reserved blocks/metadata blocks after successful
+-       * block allocation which had been deferred till now.
+-       */
+-      if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
+-              ext4_da_update_reserve_space(inode, retval);
+-
+       up_write((&EXT4_I(inode)->i_data_sem));
+       if (retval > 0 && buffer_mapped(bh)) {
+               int ret = check_block_validity(inode, "file system "
diff --git a/queue-2.6.32/0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch b/queue-2.6.32/0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch
new file mode 100644 (file)
index 0000000..b6119c8
--- /dev/null
@@ -0,0 +1,99 @@
+From 3a1a12ca4219f564fe4f86cae1bfb563422a2d15 Mon Sep 17 00:00:00 2001
+From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Date: Sun, 30 May 2010 22:49:26 -0400
+Subject: ext4: Drop EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE flag
+
+commit 1296cc85c26e94eb865d03f82140f27d598de467 upstream (as of v2.6.33-rc6)
+
+We should update reserve space if it is a delalloc buffer,
+which is indicated by the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
+So use EXT4_GET_BLOCKS_DELALLOC_RESERVE in place of
+EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE
+
+[ Stable note: This fixes a corruption caused by the following
+  reproduction case:
+
+  rm -f $TEST_FN
+  touch $TEST_FN
+  fallocate -n -o 656712 -l 858907 $TEST_FN
+  dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=1011020 count=36983
+  sync
+  dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=332121 count=24005
+  dd if=/dev/zero of=$TEST_FN conv=notrunc bs=1 seek=1040179 count=93319
+
+  If the filesystem is then unmounted and e2fsck run forced, the
+  i_blocks field for the file $TEST_FN will be found to be incorrect. ]
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h    |    7 ++-----
+ fs/ext4/extents.c |    4 ++--
+ fs/ext4/inode.c   |    8 ++++----
+ 3 files changed, 8 insertions(+), 11 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -361,14 +361,11 @@ struct ext4_new_group_data {
+          so set the magic i_delalloc_reserve_flag after taking the 
+          inode allocation semaphore for */
+ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE      0x0004
+-      /* Call ext4_da_update_reserve_space() after successfully 
+-         allocating the blocks */
+-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE  0x0008
+       /* caller is from the direct IO path, request to creation of an
+       unitialized extents if not allocated, split the uninitialized
+       extent if blocks has been preallocated already*/
+-#define EXT4_GET_BLOCKS_DIO                   0x0010
+-#define EXT4_GET_BLOCKS_CONVERT                       0x0020
++#define EXT4_GET_BLOCKS_DIO                   0x0008
++#define EXT4_GET_BLOCKS_CONVERT                       0x0010
+ #define EXT4_GET_BLOCKS_DIO_CREATE_EXT                (EXT4_GET_BLOCKS_DIO|\
+                                        EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+       /* Convert extent to initialized after direct IO complete */
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3148,7 +3148,7 @@ out:
+        * But fallocate would have already updated quota and block
+        * count for this offset. So cancel these reservation
+        */
+-      if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++      if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               ext4_da_update_reserve_space(inode, allocated, 0);
+ map_out:
+@@ -3394,7 +3394,7 @@ int ext4_ext_get_blocks(handle_t *handle
+        * Update reserved blocks/metadata blocks after successful
+        * block allocation which had been deferred till now.
+        */
+-      if (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)
++      if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+               ext4_da_update_reserve_space(inode, allocated, 1);
+       /*
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1358,7 +1358,7 @@ int ext4_get_blocks(handle_t *handle, st
+                * reserve space here.
+                */
+               if ((retval > 0) &&
+-                      (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
++                      (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
+                       ext4_da_update_reserve_space(inode, retval, 1);
+       }
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+@@ -2261,10 +2261,10 @@ static int mpage_da_map_blocks(struct mp
+        * variables are updated after the blocks have been allocated.
+        */
+       new.b_state = 0;
+-      get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+-                          EXT4_GET_BLOCKS_DELALLOC_RESERVE);
++      get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
+       if (mpd->b_state & (1 << BH_Delay))
+-              get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
++              get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
++
+       blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+                              &new, get_blocks_flags);
+       if (blks < 0) {
diff --git a/queue-2.6.32/0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch b/queue-2.6.32/0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch
new file mode 100644 (file)
index 0000000..57f873c
--- /dev/null
@@ -0,0 +1,409 @@
+From f7ae767b11e7ac054c5f8de55e5a83ec7c60c6a0 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:27 -0400
+Subject: ext4: Use bitops to read/modify EXT4_I(inode)->i_state
+
+commit 19f5fb7ad679bb361222c7916086435020c37cce upstream (as of v2.6.33-git11)
+
+At several places we modify EXT4_I(inode)->i_state without holding
+i_mutex (ext4_release_file, ext4_bmap, ext4_journalled_writepage,
+ext4_do_update_inode, ...). These modifications are racy and we can
+lose updates to i_state. So convert handling of i_state to use bitops
+which are atomic.
+
+Cc: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h    |   41 +++++++++++++++++++++++++++++------------
+ fs/ext4/extents.c |    8 ++++----
+ fs/ext4/file.c    |    4 ++--
+ fs/ext4/ialloc.c  |    3 ++-
+ fs/ext4/inode.c   |   38 ++++++++++++++++++++------------------
+ fs/ext4/migrate.c |    6 +++---
+ fs/ext4/xattr.c   |   22 +++++++++++-----------
+ 7 files changed, 71 insertions(+), 51 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -313,17 +313,6 @@ static inline __u32 ext4_mask_flags(umod
+               return flags & EXT4_OTHER_FLMASK;
+ }
+-/*
+- * Inode dynamic state flags
+- */
+-#define EXT4_STATE_JDATA              0x00000001 /* journaled data exists */
+-#define EXT4_STATE_NEW                        0x00000002 /* inode is newly created */
+-#define EXT4_STATE_XATTR              0x00000004 /* has in-inode xattrs */
+-#define EXT4_STATE_NO_EXPAND          0x00000008 /* No space for expansion */
+-#define EXT4_STATE_DA_ALLOC_CLOSE     0x00000010 /* Alloc DA blks on close */
+-#define EXT4_STATE_EXT_MIGRATE                0x00000020 /* Inode is migrating */
+-#define EXT4_STATE_DIO_UNWRITTEN      0x00000040 /* need convert on dio done*/
+-
+ /* Used to pass group descriptor data when online resize is done */
+ struct ext4_new_group_input {
+       __u32 group;            /* Group number for this data */
+@@ -624,7 +613,7 @@ struct ext4_inode_info {
+        * near to their parent directory's inode.
+        */
+       ext4_group_t    i_block_group;
+-      __u32   i_state;                /* Dynamic state flags for ext4 */
++      unsigned long   i_state_flags;          /* Dynamic state flags */
+       ext4_lblk_t             i_dir_start_lookup;
+ #ifdef CONFIG_EXT4_FS_XATTR
+@@ -1044,6 +1033,34 @@ static inline int ext4_valid_inum(struct
+               (ino >= EXT4_FIRST_INO(sb) &&
+                ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
+ }
++
++/*
++ * Inode dynamic state flags
++ */
++enum {
++      EXT4_STATE_JDATA,               /* journaled data exists */
++      EXT4_STATE_NEW,                 /* inode is newly created */
++      EXT4_STATE_XATTR,               /* has in-inode xattrs */
++      EXT4_STATE_NO_EXPAND,           /* No space for expansion */
++      EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
++      EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
++      EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
++};
++
++static inline int ext4_test_inode_state(struct inode *inode, int bit)
++{
++      return test_bit(bit, &EXT4_I(inode)->i_state_flags);
++}
++
++static inline void ext4_set_inode_state(struct inode *inode, int bit)
++{
++      set_bit(bit, &EXT4_I(inode)->i_state_flags);
++}
++
++static inline void ext4_clear_inode_state(struct inode *inode, int bit)
++{
++      clear_bit(bit, &EXT4_I(inode)->i_state_flags);
++}
+ #else
+ /* Assume that user mode programs are passing in an ext4fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3082,7 +3082,7 @@ ext4_ext_handle_uninitialized_extents(ha
+               if (io)
+                       io->flag = DIO_AIO_UNWRITTEN;
+               else
+-                      EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN;
++                      ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+               goto out;
+       }
+       /* async DIO end_io complete, convert the filled extent to written */
+@@ -3368,8 +3368,8 @@ int ext4_ext_get_blocks(handle_t *handle
+                       if (io)
+                               io->flag = DIO_AIO_UNWRITTEN;
+                       else
+-                              EXT4_I(inode)->i_state |=
+-                                      EXT4_STATE_DIO_UNWRITTEN;;
++                              ext4_set_inode_state(inode,
++                                                   EXT4_STATE_DIO_UNWRITTEN);
+               }
+       }
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+@@ -3745,7 +3745,7 @@ static int ext4_xattr_fiemap(struct inod
+       int error = 0;
+       /* in-inode? */
+-      if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
++      if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+               struct ext4_iloc iloc;
+               int offset;     /* offset of xattr in inode */
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -35,9 +35,9 @@
+  */
+ static int ext4_release_file(struct inode *inode, struct file *filp)
+ {
+-      if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
++      if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
+               ext4_alloc_da_blocks(inode);
+-              EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
++              ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+       }
+       /* if we are the last writer on the inode, drop the block reservation */
+       if ((filp->f_mode & FMODE_WRITE) &&
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -1029,7 +1029,8 @@ got:
+       inode->i_generation = sbi->s_next_generation++;
+       spin_unlock(&sbi->s_next_gen_lock);
+-      ei->i_state = EXT4_STATE_NEW;
++      ei->i_state_flags = 0;
++      ext4_set_inode_state(inode, EXT4_STATE_NEW);
+       ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1348,7 +1348,7 @@ int ext4_get_blocks(handle_t *handle, st
+                        * i_data's format changing.  Force the migrate
+                        * to fail by clearing migrate flags
+                        */
+-                      EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
++                      ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+               }
+               /*
+@@ -1835,7 +1835,7 @@ static int ext4_journalled_write_end(str
+       new_i_size = pos + copied;
+       if (new_i_size > inode->i_size)
+               i_size_write(inode, pos+copied);
+-      EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
++      ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+       if (new_i_size > EXT4_I(inode)->i_disksize) {
+               ext4_update_i_disksize(inode, new_i_size);
+               ret2 = ext4_mark_inode_dirty(handle, inode);
+@@ -2673,7 +2673,7 @@ static int __ext4_journalled_writepage(s
+               ret = err;
+       walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+-      EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
++      ext4_set_inode_state(inode, EXT4_STATE_JDATA);
+ out:
+       return ret;
+ }
+@@ -3344,7 +3344,8 @@ static sector_t ext4_bmap(struct address
+               filemap_write_and_wait(mapping);
+       }
+-      if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
++      if (EXT4_JOURNAL(inode) &&
++          ext4_test_inode_state(inode, EXT4_STATE_JDATA)) {
+               /*
+                * This is a REALLY heavyweight approach, but the use of
+                * bmap on dirty files is expected to be extremely rare:
+@@ -3363,7 +3364,7 @@ static sector_t ext4_bmap(struct address
+                * everything they get.
+                */
+-              EXT4_I(inode)->i_state &= ~EXT4_STATE_JDATA;
++              ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
+               journal = EXT4_JOURNAL(inode);
+               jbd2_journal_lock_updates(journal);
+               err = jbd2_journal_flush(journal);
+@@ -3831,8 +3832,8 @@ static ssize_t ext4_ext_direct_IO(int rw
+               if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+                       ext4_free_io_end(iocb->private);
+                       iocb->private = NULL;
+-              } else if (ret > 0 && (EXT4_I(inode)->i_state &
+-                                     EXT4_STATE_DIO_UNWRITTEN)) {
++              } else if (ret > 0 && ext4_test_inode_state(inode,
++                                              EXT4_STATE_DIO_UNWRITTEN)) {
+                       int err;
+                       /*
+                        * for non AIO case, since the IO is already
+@@ -3842,7 +3843,7 @@ static ssize_t ext4_ext_direct_IO(int rw
+                                                            offset, ret);
+                       if (err < 0)
+                               ret = err;
+-                      EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN;
++                      ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+               }
+               return ret;
+       }
+@@ -4490,7 +4491,7 @@ void ext4_truncate(struct inode *inode)
+               return;
+       if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+-              ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
++              ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+               ext4_ext_truncate(inode);
+@@ -4776,7 +4777,7 @@ int ext4_get_inode_loc(struct inode *ino
+ {
+       /* We have all inode data except xattrs in memory here. */
+       return __ext4_get_inode_loc(inode, iloc,
+-              !(EXT4_I(inode)->i_state & EXT4_STATE_XATTR));
++              !ext4_test_inode_state(inode, EXT4_STATE_XATTR));
+ }
+ void ext4_set_inode_flags(struct inode *inode)
+@@ -4870,7 +4871,7 @@ struct inode *ext4_iget(struct super_blo
+       }
+       inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+-      ei->i_state = 0;
++      ei->i_state_flags = 0;
+       ei->i_dir_start_lookup = 0;
+       ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
+       /* We now have enough fields to check if the inode was active or not.
+@@ -4953,7 +4954,7 @@ struct inode *ext4_iget(struct super_blo
+                                       EXT4_GOOD_OLD_INODE_SIZE +
+                                       ei->i_extra_isize;
+                       if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
+-                              ei->i_state |= EXT4_STATE_XATTR;
++                              ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+               }
+       } else
+               ei->i_extra_isize = 0;
+@@ -5093,7 +5094,7 @@ static int ext4_do_update_inode(handle_t
+       /* For fields not not tracking in the in-memory inode,
+        * initialise them to zero for new inodes. */
+-      if (ei->i_state & EXT4_STATE_NEW)
++      if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
+               memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+       ext4_get_inode_flags(ei);
+@@ -5189,7 +5190,7 @@ static int ext4_do_update_inode(handle_t
+       rc = ext4_handle_dirty_metadata(handle, inode, bh);
+       if (!err)
+               err = rc;
+-      ei->i_state &= ~EXT4_STATE_NEW;
++      ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+       ext4_update_inode_fsync_trans(handle, inode, 0);
+ out_brelse:
+@@ -5613,8 +5614,8 @@ static int ext4_expand_extra_isize(struc
+       entry = IFIRST(header);
+       /* No extended attributes present */
+-      if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR) ||
+-              header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
++      if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
++          header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
+               memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE, 0,
+                       new_extra_isize);
+               EXT4_I(inode)->i_extra_isize = new_extra_isize;
+@@ -5658,7 +5659,7 @@ int ext4_mark_inode_dirty(handle_t *hand
+       err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (ext4_handle_valid(handle) &&
+           EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize &&
+-          !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) {
++          !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
+               /*
+                * We need extra buffer credits since we may write into EA block
+                * with this same handle. If journal_extend fails, then it will
+@@ -5672,7 +5673,8 @@ int ext4_mark_inode_dirty(handle_t *hand
+                                                     sbi->s_want_extra_isize,
+                                                     iloc, handle);
+                       if (ret) {
+-                              EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
++                              ext4_set_inode_state(inode,
++                                                   EXT4_STATE_NO_EXPAND);
+                               if (mnt_count !=
+                                       le16_to_cpu(sbi->s_es->s_mnt_count)) {
+                                       ext4_warning(inode->i_sb, __func__,
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -357,12 +357,12 @@ static int ext4_ext_swap_inode_data(hand
+        * happened after we started the migrate. We need to
+        * fail the migrate
+        */
+-      if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
++      if (!ext4_test_inode_state(inode, EXT4_STATE_EXT_MIGRATE)) {
+               retval = -EAGAIN;
+               up_write(&EXT4_I(inode)->i_data_sem);
+               goto err_out;
+       } else
+-              EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
++              ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+       /*
+        * We have the extent map build with the tmp inode.
+        * Now copy the i_data across
+@@ -524,7 +524,7 @@ int ext4_ext_migrate(struct inode *inode
+        * allocation.
+        */
+       down_read((&EXT4_I(inode)->i_data_sem));
+-      EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
++      ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
+       up_read((&EXT4_I(inode)->i_data_sem));
+       handle = ext4_journal_start(inode, 1);
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -267,7 +267,7 @@ ext4_xattr_ibody_get(struct inode *inode
+       void *end;
+       int error;
+-      if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
++      if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+               return -ENODATA;
+       error = ext4_get_inode_loc(inode, &iloc);
+       if (error)
+@@ -393,7 +393,7 @@ ext4_xattr_ibody_list(struct inode *inod
+       void *end;
+       int error;
+-      if (!(EXT4_I(inode)->i_state & EXT4_STATE_XATTR))
++      if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+               return 0;
+       error = ext4_get_inode_loc(inode, &iloc);
+       if (error)
+@@ -903,7 +903,7 @@ ext4_xattr_ibody_find(struct inode *inod
+       is->s.base = is->s.first = IFIRST(header);
+       is->s.here = is->s.first;
+       is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+-      if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
++      if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
+               error = ext4_xattr_check_names(IFIRST(header), is->s.end);
+               if (error)
+                       return error;
+@@ -935,10 +935,10 @@ ext4_xattr_ibody_set(handle_t *handle, s
+       header = IHDR(inode, ext4_raw_inode(&is->iloc));
+       if (!IS_LAST_ENTRY(s->first)) {
+               header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+-              EXT4_I(inode)->i_state |= EXT4_STATE_XATTR;
++              ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+       } else {
+               header->h_magic = cpu_to_le32(0);
+-              EXT4_I(inode)->i_state &= ~EXT4_STATE_XATTR;
++              ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+       }
+       return 0;
+ }
+@@ -981,8 +981,8 @@ ext4_xattr_set_handle(handle_t *handle,
+       if (strlen(name) > 255)
+               return -ERANGE;
+       down_write(&EXT4_I(inode)->xattr_sem);
+-      no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
+-      EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
++      no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
++      ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+       error = ext4_get_inode_loc(inode, &is.iloc);
+       if (error)
+@@ -992,10 +992,10 @@ ext4_xattr_set_handle(handle_t *handle,
+       if (error)
+               goto cleanup;
+-      if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
++      if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) {
+               struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
+               memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
+-              EXT4_I(inode)->i_state &= ~EXT4_STATE_NEW;
++              ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+       }
+       error = ext4_xattr_ibody_find(inode, &i, &is);
+@@ -1047,7 +1047,7 @@ ext4_xattr_set_handle(handle_t *handle,
+               ext4_xattr_update_super_block(handle, inode->i_sb);
+               inode->i_ctime = ext4_current_time(inode);
+               if (!value)
+-                      EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
++                      ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+               error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+               /*
+                * The bh is consumed by ext4_mark_iloc_dirty, even with
+@@ -1062,7 +1062,7 @@ cleanup:
+       brelse(is.iloc.bh);
+       brelse(bs.bh);
+       if (no_expand == 0)
+-              EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
++              ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+       up_write(&EXT4_I(inode)->xattr_sem);
+       return error;
+ }
diff --git a/queue-2.6.32/0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch b/queue-2.6.32/0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch
new file mode 100644 (file)
index 0000000..b46cbd4
--- /dev/null
@@ -0,0 +1,99 @@
+From 04cbf99a9333c66de2474429c01e13d110aa5fd0 Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:28 -0400
+Subject: ext4: Fix BUG_ON at fs/buffer.c:652 in no journal mode
+
+commit 73b50c1c92666d326b5fa2c945d46509f2f6d91f upstream (as of v2.6.33-git11)
+
+Calls to ext4_handle_dirty_metadata should only pass in an inode
+pointer for inode-specific metadata, and not for shared metadata
+blocks such as inode table blocks, block group descriptors, the
+superblock, etc.
+
+The BUG_ON can get tripped when updating a special device (such as a
+block device) that is opened (so that i_mapping is set in
+fs/block_dev.c) and the file system is mounted in no journal mode.
+
+Addresses-Google-Bug: #2404870
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4_jbd2.c |    2 +-
+ fs/ext4/ialloc.c    |    2 +-
+ fs/ext4/inode.c     |    6 +++---
+ fs/ext4/namei.c     |    4 ++--
+ 4 files changed, 7 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.c
++++ b/fs/ext4/ext4_jbd2.c
+@@ -89,7 +89,7 @@ int __ext4_handle_dirty_metadata(const c
+                       ext4_journal_abort_handle(where, __func__, bh,
+                                                 handle, err);
+       } else {
+-              if (inode && bh)
++              if (inode)
+                       mark_buffer_dirty_inode(bh, inode);
+               else
+                       mark_buffer_dirty(bh);
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -904,7 +904,7 @@ repeat_in_this_group:
+                               BUFFER_TRACE(inode_bitmap_bh,
+                                       "call ext4_handle_dirty_metadata");
+                               err = ext4_handle_dirty_metadata(handle,
+-                                                               inode,
++                                                               NULL,
+                                                       inode_bitmap_bh);
+                               if (err)
+                                       goto fail;
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5158,7 +5158,7 @@ static int ext4_do_update_inode(handle_t
+                                       EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+                       sb->s_dirt = 1;
+                       ext4_handle_sync(handle);
+-                      err = ext4_handle_dirty_metadata(handle, inode,
++                      err = ext4_handle_dirty_metadata(handle, NULL,
+                                       EXT4_SB(sb)->s_sbh);
+               }
+       }
+@@ -5187,7 +5187,7 @@ static int ext4_do_update_inode(handle_t
+       }
+       BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+-      rc = ext4_handle_dirty_metadata(handle, inode, bh);
++      rc = ext4_handle_dirty_metadata(handle, NULL, bh);
+       if (!err)
+               err = rc;
+       ext4_clear_inode_state(inode, EXT4_STATE_NEW);
+@@ -5741,7 +5741,7 @@ static int ext4_pin_inode(handle_t *hand
+                       err = jbd2_journal_get_write_access(handle, iloc.bh);
+                       if (!err)
+                               err = ext4_handle_dirty_metadata(handle,
+-                                                               inode,
++                                                               NULL,
+                                                                iloc.bh);
+                       brelse(iloc.bh);
+               }
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2024,7 +2024,7 @@ int ext4_orphan_add(handle_t *handle, st
+       /* Insert this inode at the head of the on-disk orphan list... */
+       NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+       EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+-      err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh);
++      err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+       rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+       if (!err)
+               err = rc;
+@@ -2096,7 +2096,7 @@ int ext4_orphan_del(handle_t *handle, st
+               if (err)
+                       goto out_brelse;
+               sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+-              err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh);
++              err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+       } else {
+               struct ext4_iloc iloc2;
+               struct inode *i_prev =
diff --git a/queue-2.6.32/0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch b/queue-2.6.32/0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch
new file mode 100644 (file)
index 0000000..2909322
--- /dev/null
@@ -0,0 +1,140 @@
+From 9d176d321904553ab92a5df99e25ccb268a5560e Mon Sep 17 00:00:00 2001
+From: Jiaying Zhang <jiayingz@google.com>
+Date: Sun, 30 May 2010 22:49:29 -0400
+Subject: ext4: Add flag to files with blocks intentionally past EOF
+
+commit c8d46e41bc744c8fa0092112af3942fcd46c8b18 upstream (as of v2.6.33-git11)
+
+fallocate() may potentially instantiate blocks past EOF, depending
+on the flags used when it is called.
+
+e2fsck currently has a test for blocks past i_size, and it
+sometimes trips up - noticeably on xfstests 013 which runs fsstress.
+
+This patch from Jiaying does fix it up - it (along with
+e2fsprogs updates and other patches recently from Aneesh) has
+survived many fsstress runs in a row.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: Jiaying Zhang <jiayingz@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h    |    6 ++++--
+ fs/ext4/extents.c |   22 +++++++++++++++++++++-
+ fs/ext4/inode.c   |    9 ++++++++-
+ fs/ext4/ioctl.c   |    9 +++++++++
+ 4 files changed, 42 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -284,10 +284,12 @@ struct flex_groups {
+ #define EXT4_TOPDIR_FL                        0x00020000 /* Top of directory hierarchies*/
+ #define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
+ #define EXT4_EXTENTS_FL                       0x00080000 /* Inode uses extents */
++#define EXT4_EA_INODE_FL              0x00200000 /* Inode used for large EA */
++#define EXT4_EOFBLOCKS_FL             0x00400000 /* Blocks allocated beyond EOF */
+ #define EXT4_RESERVED_FL              0x80000000 /* reserved for ext4 lib */
+-#define EXT4_FL_USER_VISIBLE          0x000BDFFF /* User visible flags */
+-#define EXT4_FL_USER_MODIFIABLE               0x000B80FF /* User modifiable flags */
++#define EXT4_FL_USER_VISIBLE          0x004BDFFF /* User visible flags */
++#define EXT4_FL_USER_MODIFIABLE               0x004B80FF /* User modifiable flags */
+ /* Flags that should be inherited by new inodes from their parent. */
+ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3191,7 +3191,7 @@ int ext4_ext_get_blocks(handle_t *handle
+ {
+       struct ext4_ext_path *path = NULL;
+       struct ext4_extent_header *eh;
+-      struct ext4_extent newex, *ex;
++      struct ext4_extent newex, *ex, *last_ex;
+       ext4_fsblk_t newblock;
+       int err = 0, depth, ret, cache_type;
+       unsigned int allocated = 0;
+@@ -3372,6 +3372,19 @@ int ext4_ext_get_blocks(handle_t *handle
+                                                    EXT4_STATE_DIO_UNWRITTEN);
+               }
+       }
++
++      if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
++              if (eh->eh_entries) {
++                      last_ex = EXT_LAST_EXTENT(eh);
++                      if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
++                                          + ext4_ext_get_actual_len(last_ex))
++                              EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++              } else {
++                      WARN_ON(eh->eh_entries == 0);
++                      ext4_error(inode->i_sb, __func__,
++                              "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
++                      }
++      }
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+       if (err) {
+               /* free data blocks we just allocated */
+@@ -3505,6 +3518,13 @@ static void ext4_falloc_update_inode(str
+                       i_size_write(inode, new_size);
+               if (new_size > EXT4_I(inode)->i_disksize)
+                       ext4_update_i_disksize(inode, new_size);
++      } else {
++              /*
++               * Mark that we allocate beyond EOF so the subsequent truncate
++               * can proceed even if the new size is the same as i_size.
++               */
++              if (new_size > i_size_read(inode))
++                      EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+       }
+ }
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4490,6 +4490,8 @@ void ext4_truncate(struct inode *inode)
+       if (!ext4_can_truncate(inode))
+               return;
++      EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++
+       if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+               ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+@@ -5345,7 +5347,9 @@ int ext4_setattr(struct dentry *dentry,
+       }
+       if (S_ISREG(inode->i_mode) &&
+-          attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
++          attr->ia_valid & ATTR_SIZE &&
++          (attr->ia_size < inode->i_size ||
++           (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+               handle_t *handle;
+               handle = ext4_journal_start(inode, 3);
+@@ -5376,6 +5380,9 @@ int ext4_setattr(struct dentry *dentry,
+                               goto err_out;
+                       }
+               }
++              /* ext4_truncate will clear the flag */
++              if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
++                      ext4_truncate(inode);
+       }
+       rc = inode_setattr(inode, attr);
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -92,6 +92,15 @@ long ext4_ioctl(struct file *filp, unsig
+                       flags &= ~EXT4_EXTENTS_FL;
+               }
++              if (flags & EXT4_EOFBLOCKS_FL) {
++                      /* we don't support adding EOFBLOCKS flag */
++                      if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
++                              err = -EOPNOTSUPP;
++                              goto flags_out;
++                      }
++              } else if (oldflags & EXT4_EOFBLOCKS_FL)
++                      ext4_truncate(inode);
++
+               handle = ext4_journal_start(inode, 1);
+               if (IS_ERR(handle)) {
+                       err = PTR_ERR(handle);
diff --git a/queue-2.6.32/0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch b/queue-2.6.32/0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch
new file mode 100644 (file)
index 0000000..1a2f3f4
--- /dev/null
@@ -0,0 +1,60 @@
+From 2cbbb92297f15740e27f2e87eb21ab86d4432cba Mon Sep 17 00:00:00 2001
+From: Tao Ma <tao.ma@oracle.com>
+Date: Sun, 30 May 2010 22:49:30 -0400
+Subject: ext4: Fix fencepost error in choosing group vs file preallocation.
+
+commit cc483f102c3f703e853c96f95a654f0106fb2603 upstream (as of v2.6.33-git11)
+
+The ext4 multiblock allocator decides whether to use group or file
+preallocation based on the file size.  When the file size reaches
+s_mb_stream_request (default is 16 blocks), it changes to use a
+file-specific preallocation. This is cool, but it has a tiny problem.
+
+See a simple script:
+mkfs.ext4 -b 1024 /dev/sda8 1000000
+mount -t ext4 -o nodelalloc /dev/sda8 /mnt/ext4
+for((i=0;i<5;i++))
+do
+cat /mnt/4096>>/mnt/ext4/a     #4096 is a file with 4096 characters.
+cat /mnt/4096>>/mnt/ext4/b
+done
+debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1
+
+And you get
+BLOCKS:
+(0-14):8705-8719, (15):2356, (16-19):8465-8468
+
+So there are 3 extents, a bit strange for the lonely 15th logical
+block.  As we write to the 16 blocks, we choose file preallocation in
+ext4_mb_group_or_file, but in ext4_mb_normalize_request, we meet with
+the 16*1024 range, so no preallocation will be carried out. File b then
+reserves the space after '2356', so when we write 16, we start from
+another part.
+
+This patch just changes the check in ext4_mb_group_or_file, so
+that for the lonely 15 we will still use group preallocation.
+After the patch, we will get:
+debuge4fs -R 'stat a' /dev/sda8|grep BLOCKS -A 1
+BLOCKS:
+(0-15):8705-8720, (16-19):8465-8468
+
+Looks more sane. Thanks.
+
+Signed-off-by: Tao Ma <tao.ma@oracle.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -3938,7 +3938,7 @@ static void ext4_mb_group_or_file(struct
+       /* don't use group allocation for large files */
+       size = max(size, isize);
+-      if (size >= sbi->s_mb_stream_request) {
++      if (size > sbi->s_mb_stream_request) {
+               ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+               return;
+       }
diff --git a/queue-2.6.32/0017-ext4-fix-error-handling-in-migrate.patch b/queue-2.6.32/0017-ext4-fix-error-handling-in-migrate.patch
new file mode 100644 (file)
index 0000000..8295f0a
--- /dev/null
@@ -0,0 +1,75 @@
+From 492c93e8097f0bf58b2884064af85242fabe5d71 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:31 -0400
+Subject: ext4: fix error handling in migrate
+
+commit f39490bcd1691d65dc33689222a12e1fc13dd824 upstream (as of v2.6.33-git11)
+
+Set i_nlink to zero for the temporary inode from the very beginning;
+otherwise we may fail to start a new journal handle and this
+inode will be unreferenced but with i_nlink == 1.
+Since we hold an inode reference it cannot be pruned.
+
+Also add missed journal_start retval check.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/migrate.c |   29 ++++++++++++++---------------
+ 1 file changed, 14 insertions(+), 15 deletions(-)
+
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -494,14 +494,10 @@ int ext4_ext_migrate(struct inode *inode
+       }
+       i_size_write(tmp_inode, i_size_read(inode));
+       /*
+-       * We don't want the inode to be reclaimed
+-       * if we got interrupted in between. We have
+-       * this tmp inode carrying reference to the
+-       * data blocks of the original file. We set
+-       * the i_nlink to zero at the last stage after
+-       * switching the original file to extent format
++       * Set the i_nlink to zero so it will be deleted later
++       * when we drop inode reference.
+        */
+-      tmp_inode->i_nlink = 1;
++      tmp_inode->i_nlink = 0;
+       ext4_ext_tree_init(handle, tmp_inode);
+       ext4_orphan_add(handle, tmp_inode);
+@@ -528,6 +524,16 @@ int ext4_ext_migrate(struct inode *inode
+       up_read((&EXT4_I(inode)->i_data_sem));
+       handle = ext4_journal_start(inode, 1);
++      if (IS_ERR(handle)) {
++              /*
++               * It is impossible to update on-disk structures without
++               * a handle, so just rollback in-core changes and live other
++               * work to orphan_list_cleanup()
++               */
++              ext4_orphan_del(NULL, tmp_inode);
++              retval = PTR_ERR(handle);
++              goto out;
++      }
+       ei = EXT4_I(inode);
+       i_data = ei->i_data;
+@@ -609,15 +615,8 @@ err_out:
+       /* Reset the extent details */
+       ext4_ext_tree_init(handle, tmp_inode);
+-
+-      /*
+-       * Set the i_nlink to zero so that
+-       * generic_drop_inode really deletes the
+-       * inode
+-       */
+-      tmp_inode->i_nlink = 0;
+-
+       ext4_journal_stop(handle);
++out:
+       unlock_new_inode(tmp_inode);
+       iput(tmp_inode);
diff --git a/queue-2.6.32/0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch b/queue-2.6.32/0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch
new file mode 100644 (file)
index 0000000..d752a67
--- /dev/null
@@ -0,0 +1,28 @@
+From 6c582d8b4e6868f8e16d160c0435530d5f8fa8e5 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:32 -0400
+Subject: ext4: explicitly remove inode from orphan list after failed direct io
+
+commit da1dafca84413145f5ac59998b4cdd06fb89f721 upstream (as of v2.6.33-git11)
+
+Otherwise non-empty orphan list will be triggered on umount.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -3480,6 +3480,9 @@ retry:
+                        * but cannot extend i_size. Bail out and pretend
+                        * the write failed... */
+                       ret = PTR_ERR(handle);
++                      if (inode->i_nlink)
++                              ext4_orphan_del(NULL, inode);
++
+                       goto out;
+               }
+               if (inode->i_nlink)
diff --git a/queue-2.6.32/0019-ext4-Handle-non-empty-on-disk-orphan-link.patch b/queue-2.6.32/0019-ext4-Handle-non-empty-on-disk-orphan-link.patch
new file mode 100644 (file)
index 0000000..9bef19c
--- /dev/null
@@ -0,0 +1,45 @@
+From 7765050b0f7e5ffc9146c5cea83a14774ff03a73 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:33 -0400
+Subject: ext4: Handle non empty on-disk orphan link
+
+commit 6e3617e579e070d3655a93ee9ed7149113e795e0 upstream (as of v2.6.33-git11)
+
+In case of truncate errors we explicitly remove inode from in-core
+orphan list via orphan_del(NULL, inode) without modifying the on-disk list.
+
+But later on, the same inode may be inserted in the orphan list again
+which will result in the on-disk linked list getting corrupted.  If the inode's
+i_dtime contains a valid value, then skip the on-disk list modification.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/namei.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -2020,6 +2020,13 @@ int ext4_orphan_add(handle_t *handle, st
+       err = ext4_reserve_inode_write(handle, inode, &iloc);
+       if (err)
+               goto out_unlock;
++      /*
++       * Due to previous errors inode may be already a part of on-disk
++       * orphan list. If so skip on-disk list modification.
++       */
++      if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
++              (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
++                      goto mem_insert;
+       /* Insert this inode at the head of the on-disk orphan list... */
+       NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+@@ -2037,6 +2044,7 @@ int ext4_orphan_add(handle_t *handle, st
+        *
+        * This is safe: on error we're going to ignore the orphan list
+        * anyway on the next recovery. */
++mem_insert:
+       if (!err)
+               list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
diff --git a/queue-2.6.32/0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch b/queue-2.6.32/0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch
new file mode 100644 (file)
index 0000000..bc332b5
--- /dev/null
@@ -0,0 +1,40 @@
+From 5921c8d6a6e598b1101b5785f09bbe334e92957d Mon Sep 17 00:00:00 2001
+From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Date: Sun, 30 May 2010 22:49:34 -0400
+Subject: ext4: make "offset" consistent in ext4_check_dir_entry()
+
+commit b8b8afe236e97b6359d46d3a3f8c46455e192271 upstream (as of v2.6.33-git11)
+
+The callers of ext4_check_dir_entry() usually pass in the "file
+offset" (ext4_readdir, htree_dirblock_to_tree, search_dirblock,
+ext4_dx_find_entry, empty_dir), but a few callers (add_dirent_to_buf,
+ext4_delete_entry) only pass in the buffer offset.
+
+To accommodate those last two (which would be hard to fix otherwise),
+this patch changes ext4_check_dir_entry() to print the physical block
+number and the relative offset as well as the passed-in offset.
+
+Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/dir.c |    8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -84,9 +84,11 @@ int ext4_check_dir_entry(const char *fun
+       if (error_msg != NULL)
+               ext4_error(dir->i_sb, function,
+-                      "bad entry in directory #%lu: %s - "
+-                      "offset=%u, inode=%u, rec_len=%d, name_len=%d",
+-                      dir->i_ino, error_msg, offset,
++                      "bad entry in directory #%lu: %s - block=%llu"
++                      "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
++                      dir->i_ino, error_msg,
++                      (unsigned long long) bh->b_blocknr,
++                      (unsigned) (offset%bh->b_size), offset,
+                       le32_to_cpu(de->inode),
+                       rlen, de->name_len);
+       return error_msg == NULL ? 1 : 0;
diff --git a/queue-2.6.32/0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch b/queue-2.6.32/0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch
new file mode 100644 (file)
index 0000000..71b6d63
--- /dev/null
@@ -0,0 +1,54 @@
+From 98cc8ca4405bfb2d511c83ced6c46153c04d5f76 Mon Sep 17 00:00:00 2001
+From: Akira Fujita <a-fujita@rs.jp.nec.com>
+Date: Sun, 30 May 2010 22:49:35 -0400
+Subject: ext4: Fix insertion point of extent in mext_insert_across_blocks()
+
+commit 5fd5249aa36fad98c9fd5edced352939e54f9324 upstream (as of v2.6.33-git11)
+
+If the leaf node has 2 extent space or fewer and EXT4_IOC_MOVE_EXT
+ioctl is called with the file offset where after the 2nd extent
+covers, mext_insert_across_blocks() always tries to insert extent into
+the first extent.  As a result, the file gets corrupted because of
+wrong extent order.  The patch fixes this problem.
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -252,6 +252,7 @@ mext_insert_across_blocks(handle_t *hand
+               }
+               o_start->ee_len = start_ext->ee_len;
++              eblock = le32_to_cpu(start_ext->ee_block);
+               new_flag = 1;
+       } else if (start_ext->ee_len && new_ext->ee_len &&
+@@ -262,6 +263,7 @@ mext_insert_across_blocks(handle_t *hand
+                * orig  |------------------------------|
+                */
+               o_start->ee_len = start_ext->ee_len;
++              eblock = le32_to_cpu(start_ext->ee_block);
+               new_flag = 1;
+       } else if (!start_ext->ee_len && new_ext->ee_len &&
+@@ -502,6 +504,7 @@ mext_leaf_block(handle_t *handle, struct
+               le32_to_cpu(oext->ee_block) + oext_alen) {
+               start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
+                                              le32_to_cpu(oext->ee_block));
++              start_ext.ee_block = oext->ee_block;
+               copy_extent_status(oext, &start_ext);
+       } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
+               prev_ext = oext - 1;
+@@ -515,6 +518,7 @@ mext_leaf_block(handle_t *handle, struct
+                       start_ext.ee_len = cpu_to_le16(
+                               ext4_ext_get_actual_len(prev_ext) +
+                               new_ext_alen);
++                      start_ext.ee_block = oext->ee_block;
+                       copy_extent_status(prev_ext, &start_ext);
+                       new_ext.ee_len = 0;
+               }
diff --git a/queue-2.6.32/0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch b/queue-2.6.32/0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch
new file mode 100644 (file)
index 0000000..d412ffb
--- /dev/null
@@ -0,0 +1,50 @@
+From 06518e8c9d0a67cb024545b880849b68b79a5390 Mon Sep 17 00:00:00 2001
+From: Akira Fujita <a-fujita@rs.jp.nec.com>
+Date: Sun, 30 May 2010 22:49:36 -0400
+Subject: ext4: Fix the NULL reference in double_down_write_data_sem()
+
+commit 7247c0caa23d94a1cb6b307edba9dc45fb0798d4 upstream (as of v2.6.33-git11)
+
+If EXT4_IOC_MOVE_EXT ioctl is called with NULL donor_fd, fget() in
+ext4_ioctl() gets inappropriate file structure for donor; so we need
+to do this check earlier, before calling double_down_write_data_sem().
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/move_extent.c |   16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -953,14 +953,6 @@ mext_check_arguments(struct inode *orig_
+       unsigned int blkbits = orig_inode->i_blkbits;
+       unsigned int blocksize = 1 << blkbits;
+-      /* Regular file check */
+-      if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
+-              ext4_debug("ext4 move extent: The argument files should be "
+-                      "regular file [ino:orig %lu, donor %lu]\n",
+-                      orig_inode->i_ino, donor_inode->i_ino);
+-              return -EINVAL;
+-      }
+-
+       if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
+               ext4_debug("ext4 move extent: suid or sgid is set"
+                          " to donor file [ino:orig %lu, donor %lu]\n",
+@@ -1207,6 +1199,14 @@ ext4_move_extents(struct file *o_filp, s
+                       orig_inode->i_ino, donor_inode->i_ino);
+               return -EINVAL;
+       }
++
++      /* Regular file check */
++      if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
++              ext4_debug("ext4 move extent: The argument files should be "
++                      "regular file [ino:orig %lu, donor %lu]\n",
++                      orig_inode->i_ino, donor_inode->i_ino);
++              return -EINVAL;
++      }
+       /* Protect orig and donor inodes against a truncate */
+       ret1 = mext_inode_double_lock(orig_inode, donor_inode);
diff --git a/queue-2.6.32/0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch b/queue-2.6.32/0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch
new file mode 100644 (file)
index 0000000..6e7e766
--- /dev/null
@@ -0,0 +1,58 @@
+From eee98b87da36ae78c6867d8ce1943f65a16da648 Mon Sep 17 00:00:00 2001
+From: Akira Fujita <a-fujita@rs.jp.nec.com>
+Date: Sun, 30 May 2010 22:49:37 -0400
+Subject: ext4: Code cleanup for EXT4_IOC_MOVE_EXT ioctl
+
+commit c437b2733520599a2c6e0dbcdeae611319f84707 upstream (as of v2.6.33-git11)
+
+a) Fix sparse warning in ext4_ioctl()
+b) Remove unneeded variable in mext_leaf_block()
+c) Fix spelling typo in mext_check_arguments()
+
+Signed-off-by: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c       |    3 ++-
+ fs/ext4/move_extent.c |    4 +---
+ 2 files changed, 3 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -258,7 +258,8 @@ setversion_out:
+               if (me.moved_len > 0)
+                       file_remove_suid(donor_filp);
+-              if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
++              if (copy_to_user((struct move_extent __user *)arg,
++                               &me, sizeof(me)))
+                       err = -EFAULT;
+ mext_out:
+               fput(donor_filp);
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -477,7 +477,6 @@ mext_leaf_block(handle_t *handle, struct
+       struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
+       struct ext4_extent new_ext, start_ext, end_ext;
+       ext4_lblk_t new_ext_end;
+-      ext4_fsblk_t new_phys_end;
+       int oext_alen, new_ext_alen, end_ext_alen;
+       int depth = ext_depth(orig_inode);
+       int ret;
+@@ -491,7 +490,6 @@ mext_leaf_block(handle_t *handle, struct
+       new_ext.ee_len = dext->ee_len;
+       new_ext_alen = ext4_ext_get_actual_len(&new_ext);
+       new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
+-      new_phys_end = ext_pblock(&new_ext) + new_ext_alen - 1;
+       /*
+        * Case: original extent is first
+@@ -932,7 +930,7 @@ out2:
+ }
+ /**
+- * mext_check_argumants - Check whether move extent can be done
++ * mext_check_arguments - Check whether move extent can be done
+  *
+  * @orig_inode:               original inode
+  * @donor_inode:      donor inode
diff --git a/queue-2.6.32/0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch b/queue-2.6.32/0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch
new file mode 100644 (file)
index 0000000..9fa6f64
--- /dev/null
@@ -0,0 +1,36 @@
+From 24bce2c3022a0ff4cb418ed11173bef96bd9806a Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Sun, 30 May 2010 22:49:38 -0400
+Subject: ext4: Fix estimate of # of blocks needed to write indirect-mapped files
+
+commit d330a5befb88875a9b3d2db62f9b74dadf660b13 upstream (as of v2.6.34-rc3)
+
+http://bugzilla.kernel.org/show_bug.cgi?id=15420
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1060,7 +1060,7 @@ static int ext4_indirect_calc_metadata_a
+                                             sector_t lblock)
+ {
+       struct ext4_inode_info *ei = EXT4_I(inode);
+-      int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
++      sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
+       int blk_bits;
+       if (lblock < EXT4_NDIR_BLOCKS)
+@@ -1075,7 +1075,7 @@ static int ext4_indirect_calc_metadata_a
+       }
+       ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+       ei->i_da_metadata_calc_len = 1;
+-      blk_bits = roundup_pow_of_two(lblock + 1);
++      blk_bits = order_base_2(lblock);
+       return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
+ }
diff --git a/queue-2.6.32/0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch b/queue-2.6.32/0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch
new file mode 100644 (file)
index 0000000..2e468e5
--- /dev/null
@@ -0,0 +1,38 @@
+From 0177767f12e4ebcb387fc3c7e5945611ce0dd6f1 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:39 -0400
+Subject: ext4: Fixed inode allocator to correctly track a flex_bg's used_dirs
+
+commit c4caae25187ff3f5e837c6f04eb1acc2723c72d3 upstream (as of v2.6.34-rc3)
+
+When used_dirs was introduced for the flex_groups struct, it looks
+like the accounting was not put into place properly, in some places
+manipulating free_inodes rather than used_dirs.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ialloc.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -268,7 +268,7 @@ void ext4_free_inode(handle_t *handle, s
+                                       ext4_group_t f;
+                                       f = ext4_flex_group(sbi, block_group);
+-                                      atomic_dec(&sbi->s_flex_groups[f].free_inodes);
++                                      atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+                               }
+                       }
+@@ -779,7 +779,7 @@ static int ext4_claim_inode(struct super
+               if (sbi->s_log_groups_per_flex) {
+                       ext4_group_t f = ext4_flex_group(sbi, group);
+-                      atomic_inc(&sbi->s_flex_groups[f].free_inodes);
++                      atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+               }
+       }
+       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
diff --git a/queue-2.6.32/0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch b/queue-2.6.32/0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch
new file mode 100644 (file)
index 0000000..7133182
--- /dev/null
@@ -0,0 +1,39 @@
+From 457ad9487d209f3c7bcb6de32aa393f75ba5e22d Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:40 -0400
+Subject: ext4: Fix possible lost inode write in no journal mode
+
+commit 8b472d739b2ddd8ab7fb278874f696cd95b25a5e upstream (as of v2.6.34-rc6)
+
+In the no-journal case, ext4_write_inode() will fetch the bh and call
+sync_dirty_buffer() on it.  However, if the bh has already been
+written and the bh reclaimed for some other purpose, AND if the inode
+is the only one in the inode table block in use, then
+ext4_get_inode_loc() will not read the inode table block from disk,
+but as an optimization, fill the block with zeros, assuming that its
+caller will copy in the on-disk version of the inode.  This is not
+done by ext4_write_inode(), so the contents of the inode can simply
+get lost.  The fix is to use __ext4_get_inode_loc() with in_mem set to
+0, instead of ext4_get_inode_loc().  Long term the API needs to be
+fixed so it's obvious why the latter is not safe.
+
+Addresses-Google-Bug: #2526446
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5260,7 +5260,7 @@ int ext4_write_inode(struct inode *inode
+       } else {
+               struct ext4_iloc iloc;
+-              err = ext4_get_inode_loc(inode, &iloc);
++              err = __ext4_get_inode_loc(inode, &iloc, 0);
+               if (err)
+                       return err;
+               if (wait)
diff --git a/queue-2.6.32/0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch b/queue-2.6.32/0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch
new file mode 100644 (file)
index 0000000..47c8328
--- /dev/null
@@ -0,0 +1,42 @@
+From 62de51f3a99493a99d7f4e3793b5952b40880ea0 Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:41 -0400
+Subject: ext4: Fix buffer head leaks after calls to ext4_get_inode_loc()
+
+commit fd2dd9fbaf9e498ec63eef298921e36556f7214c upstream (as of v2.6.34-rc6)
+
+Calls to ext4_get_inode_loc() return with a reference to a buffer
+head in iloc->bh.  The callers of this function in ext4_write_inode()
+when in no journal mode and in ext4_xattr_fiemap() don't release the
+buffer head after using it.
+
+Addresses-Google-Bug: #2548165
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |    1 +
+ fs/ext4/inode.c   |    1 +
+ 2 files changed, 2 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3778,6 +3778,7 @@ static int ext4_xattr_fiemap(struct inod
+               physical += offset;
+               length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+               flags |= FIEMAP_EXTENT_DATA_INLINE;
++              brelse(iloc.bh);
+       } else { /* external block */
+               physical = EXT4_I(inode)->i_file_acl << blockbits;
+               length = inode->i_sb->s_blocksize;
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5273,6 +5273,7 @@ int ext4_write_inode(struct inode *inode
+                                  (unsigned long long)iloc.bh->b_blocknr);
+                       err = -EIO;
+               }
++              brelse(iloc.bh);
+       }
+       return err;
+ }
diff --git a/queue-2.6.32/0028-ext4-Issue-the-discard-operation-before-releasing-th.patch b/queue-2.6.32/0028-ext4-Issue-the-discard-operation-before-releasing-th.patch
new file mode 100644 (file)
index 0000000..e4c3fc4
--- /dev/null
@@ -0,0 +1,58 @@
+From 462d9c2b296ce81bf4c6a6899e256ae6188f9a5a Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:49:42 -0400
+Subject: ext4: Issue the discard operation *before* releasing the blocks to be reused
+
+commit b90f687018e6d6c77d981b09203780f7001407e5 upstream (as of v2.6.34-rc6)
+
+Otherwise, we can end up having data corruption because the blocks
+could get reused and then discarded!
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15579
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c |   24 +++++++++++-------------
+ 1 file changed, 11 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2537,6 +2537,17 @@ static void release_blocks_on_commit(jou
+               mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+                        entry->count, entry->group, entry);
++              if (test_opt(sb, DISCARD)) {
++                      ext4_fsblk_t discard_block;
++
++                      discard_block = entry->start_blk +
++                              ext4_group_first_block_no(sb, entry->group);
++                      trace_ext4_discard_blocks(sb,
++                                      (unsigned long long)discard_block,
++                                      entry->count);
++                      sb_issue_discard(sb, discard_block, entry->count);
++              }
++
+               err = ext4_mb_load_buddy(sb, entry->group, &e4b);
+               /* we expect to find existing buddy because it's pinned */
+               BUG_ON(err != 0);
+@@ -2558,19 +2569,6 @@ static void release_blocks_on_commit(jou
+                       page_cache_release(e4b.bd_bitmap_page);
+               }
+               ext4_unlock_group(sb, entry->group);
+-              if (test_opt(sb, DISCARD)) {
+-                      ext4_fsblk_t discard_block;
+-                      struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+-
+-                      discard_block = (ext4_fsblk_t)entry->group *
+-                                              EXT4_BLOCKS_PER_GROUP(sb)
+-                                      + entry->start_blk
+-                                      + le32_to_cpu(es->s_first_data_block);
+-                      trace_ext4_discard_blocks(sb,
+-                                      (unsigned long long)discard_block,
+-                                      entry->count);
+-                      sb_issue_discard(sb, discard_block, entry->count);
+-              }
+               kmem_cache_free(ext4_free_ext_cachep, entry);
+               ext4_mb_release_desc(&e4b);
+       }
diff --git a/queue-2.6.32/0029-ext4-check-missed-return-value-in-ext4_sync_file.patch b/queue-2.6.32/0029-ext4-check-missed-return-value-in-ext4_sync_file.patch
new file mode 100644 (file)
index 0000000..c691f10
--- /dev/null
@@ -0,0 +1,25 @@
+From 6aac59ef585709fa8e03cf86dc741954b3af47c7 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:43 -0400
+Subject: ext4: check missed return value in ext4_sync_file()
+
+commit 0671e704658b9f26f85e78d51176daa861f955c7 upstream (as of v2.6.34-git13)
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/fsync.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -101,7 +101,7 @@ int ext4_sync_file(struct file *file, st
+                   (journal->j_fs_dev != journal->j_dev) &&
+                   (journal->j_flags & JBD2_BARRIER))
+                       blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+-              jbd2_log_wait_commit(journal, commit_tid);
++              ret = jbd2_log_wait_commit(journal, commit_tid);
+       } else if (journal->j_flags & JBD2_BARRIER)
+               blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+       return ret;
diff --git a/queue-2.6.32/0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch b/queue-2.6.32/0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch
new file mode 100644 (file)
index 0000000..d03fe27
--- /dev/null
@@ -0,0 +1,62 @@
+From bc65559adfab46dcbcab65d1830490c5043983bf Mon Sep 17 00:00:00 2001
+From: Jing Zhang <zj.barak@gmail.com>
+Date: Sun, 30 May 2010 22:49:44 -0400
+Subject: ext4: fix memory leaks in error path handling of ext4_ext_zeroout()
+
+commit b720303df7352d4a7a1f61e467e0a124913c0d41 upstream (as of v2.6.34-git13)
+
+When EIO occurs after bio is submitted, there is no memory free
+operation for bio, which results in memory leakage. And there is also
+no check against bio_alloc() for bio.
+
+Acked-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
+Signed-off-by: Jing Zhang <zj.barak@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |   15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2446,7 +2446,7 @@ static void bi_complete(struct bio *bio,
+ /* FIXME!! we need to try to merge to left or right after zero-out  */
+ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
+ {
+-      int ret = -EIO;
++      int ret;
+       struct bio *bio;
+       int blkbits, blocksize;
+       sector_t ee_pblock;
+@@ -2470,6 +2470,9 @@ static int ext4_ext_zeroout(struct inode
+                       len = ee_len;
+               bio = bio_alloc(GFP_NOIO, len);
++              if (!bio)
++                      return -ENOMEM;
++
+               bio->bi_sector = ee_pblock;
+               bio->bi_bdev   = inode->i_sb->s_bdev;
+@@ -2497,17 +2500,15 @@ static int ext4_ext_zeroout(struct inode
+               submit_bio(WRITE, bio);
+               wait_for_completion(&event);
+-              if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+-                      ret = 0;
+-              else {
+-                      ret = -EIO;
+-                      break;
++              if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
++                      bio_put(bio);
++                      return -EIO;
+               }
+               bio_put(bio);
+               ee_len    -= done;
+               ee_pblock += done  << (blkbits - 9);
+       }
+-      return ret;
++      return 0;
+ }
+ #define EXT4_EXT_ZERO_LEN 7
diff --git a/queue-2.6.32/0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch b/queue-2.6.32/0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch
new file mode 100644 (file)
index 0000000..ae93e97
--- /dev/null
@@ -0,0 +1,32 @@
+From dc93068aadac2019c504112d2761773e64e7ba72 Mon Sep 17 00:00:00 2001
+From: Jing Zhang <zj.barak@gmail.com>
+Date: Sun, 30 May 2010 22:49:45 -0400
+Subject: ext4: Remove unnecessary call to ext4_get_group_desc() in mballoc
+
+commit 62e823a2cba18509ee826d775270e8ef9071b5bc upstream (as of v2.6.34-git13)
+
+Signed-off-by: Jing Zhang <zj.barak@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c |    2 --
+ 1 file changed, 2 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2027,7 +2027,6 @@ repeat:
+               for (i = 0; i < ngroups; group++, i++) {
+                       struct ext4_group_info *grp;
+-                      struct ext4_group_desc *desc;
+                       if (group == ngroups)
+                               group = 0;
+@@ -2050,7 +2049,6 @@ repeat:
+                       }
+                       ac->ac_groups_scanned++;
+-                      desc = ext4_get_group_desc(sb, group, NULL);
+                       if (cr == 0)
+                               ext4_mb_simple_scan_group(ac, &e4b);
+                       else if (cr == 1 &&
diff --git a/queue-2.6.32/0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch b/queue-2.6.32/0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch
new file mode 100644 (file)
index 0000000..b26f5f6
--- /dev/null
@@ -0,0 +1,127 @@
+From 5fc0d2b4f06dfd2a941e23171a5a4a155383c47a Mon Sep 17 00:00:00 2001
+From: Jing Zhang <zj.barak@gmail.com>
+Date: Sun, 30 May 2010 22:49:46 -0400
+Subject: ext4: rename ext4_mb_release_desc() to ext4_mb_unload_buddy()
+
+commit e39e07fdfd98be8650385f12a7b81d6adc547510 upstream (as of v2.6.34-git13)
+
+This function cleans up after ext4_mb_load_buddy(), so the renaming
+makes the code clearer.
+
+Signed-off-by: Jing Zhang <zj.barak@gmail.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c |   24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -1150,7 +1150,7 @@ err:
+       return ret;
+ }
+-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
++static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
+ {
+       if (e4b->bd_bitmap_page)
+               page_cache_release(e4b->bd_bitmap_page);
+@@ -1618,7 +1618,7 @@ int ext4_mb_try_best_found(struct ext4_a
+       }
+       ext4_unlock_group(ac->ac_sb, group);
+-      ext4_mb_release_desc(e4b);
++      ext4_mb_unload_buddy(e4b);
+       return 0;
+ }
+@@ -1674,7 +1674,7 @@ int ext4_mb_find_by_goal(struct ext4_all
+               ext4_mb_use_best_found(ac, e4b);
+       }
+       ext4_unlock_group(ac->ac_sb, group);
+-      ext4_mb_release_desc(e4b);
++      ext4_mb_unload_buddy(e4b);
+       return 0;
+ }
+@@ -2044,7 +2044,7 @@ repeat:
+                       if (!ext4_mb_good_group(ac, group, cr)) {
+                               /* someone did allocation from this group */
+                               ext4_unlock_group(sb, group);
+-                              ext4_mb_release_desc(&e4b);
++                              ext4_mb_unload_buddy(&e4b);
+                               continue;
+                       }
+@@ -2058,7 +2058,7 @@ repeat:
+                               ext4_mb_complex_scan_group(ac, &e4b);
+                       ext4_unlock_group(sb, group);
+-                      ext4_mb_release_desc(&e4b);
++                      ext4_mb_unload_buddy(&e4b);
+                       if (ac->ac_status != AC_STATUS_CONTINUE)
+                               break;
+@@ -2148,7 +2148,7 @@ static int ext4_mb_seq_groups_show(struc
+       ext4_lock_group(sb, group);
+       memcpy(&sg, ext4_get_group_info(sb, group), i);
+       ext4_unlock_group(sb, group);
+-      ext4_mb_release_desc(&e4b);
++      ext4_mb_unload_buddy(&e4b);
+       seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
+                       sg.info.bb_fragments, sg.info.bb_first_free);
+@@ -2568,7 +2568,7 @@ static void release_blocks_on_commit(jou
+               }
+               ext4_unlock_group(sb, entry->group);
+               kmem_cache_free(ext4_free_ext_cachep, entry);
+-              ext4_mb_release_desc(&e4b);
++              ext4_mb_unload_buddy(&e4b);
+       }
+       mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
+@@ -3705,7 +3705,7 @@ out:
+       ext4_unlock_group(sb, group);
+       if (ac)
+               kmem_cache_free(ext4_ac_cachep, ac);
+-      ext4_mb_release_desc(&e4b);
++      ext4_mb_unload_buddy(&e4b);
+       put_bh(bitmap_bh);
+       return free;
+ }
+@@ -3809,7 +3809,7 @@ repeat:
+               if (bitmap_bh == NULL) {
+                       ext4_error(sb, __func__, "Error in reading block "
+                                       "bitmap for %u", group);
+-                      ext4_mb_release_desc(&e4b);
++                      ext4_mb_unload_buddy(&e4b);
+                       continue;
+               }
+@@ -3818,7 +3818,7 @@ repeat:
+               ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+               ext4_unlock_group(sb, group);
+-              ext4_mb_release_desc(&e4b);
++              ext4_mb_unload_buddy(&e4b);
+               put_bh(bitmap_bh);
+               list_del(&pa->u.pa_tmp_list);
+@@ -4082,7 +4082,7 @@ ext4_mb_discard_lg_preallocations(struct
+               ext4_mb_release_group_pa(&e4b, pa, ac);
+               ext4_unlock_group(sb, group);
+-              ext4_mb_release_desc(&e4b);
++              ext4_mb_unload_buddy(&e4b);
+               list_del(&pa->u.pa_tmp_list);
+               call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+       }
+@@ -4584,7 +4584,7 @@ do_more:
+               atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
+       }
+-      ext4_mb_release_desc(&e4b);
++      ext4_mb_unload_buddy(&e4b);
+       *freed += count;
diff --git a/queue-2.6.32/0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch b/queue-2.6.32/0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch
new file mode 100644 (file)
index 0000000..144ceae
--- /dev/null
@@ -0,0 +1,42 @@
+From 9a0bd6ee7ccc0cfdc614dbc6a4708d596ec53f82 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Sun, 30 May 2010 22:49:47 -0400
+Subject: ext4: allow defrag (EXT4_IOC_MOVE_EXT) in 32bit compat mode
+
+commit b684b2ee9409f2890a8b3aea98525bbe5f84e276 upstream (as of v2.6.34-git13)
+
+I have an x86_64 kernel with i386 userspace. e4defrag fails on the
+EXT4_IOC_MOVE_EXT ioctl because it is not wired up for the compat
+case. It seems that struct move_extent is compat safe; only types
+with fixed widths are used:
+{
+        __u32 reserved;         /* should be zero */
+        __u32 donor_fd;         /* donor file descriptor */
+        __u64 orig_start;       /* logical start offset in block for orig */
+        __u64 donor_start;      /* logical start offset in block for donor */
+        __u64 len;              /* block length to be moved */
+        __u64 moved_len;        /* moved block length */
+};
+
+Let's just wire up EXT4_IOC_MOVE_EXT for the compat case.
+
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Reviewed-by: Eric Sandeen <sandeen@redhat.com>
+CC: Akira Fujita <a-fujita@rs.jp.nec.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ioctl.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -375,6 +375,8 @@ long ext4_compat_ioctl(struct file *file
+               break;
+       case EXT4_IOC_GROUP_ADD:
+               break;
++      case EXT4_IOC_MOVE_EXT:
++              break;
+       default:
+               return -ENOIOCTLCMD;
+       }
diff --git a/queue-2.6.32/0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch b/queue-2.6.32/0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch
new file mode 100644 (file)
index 0000000..46217fa
--- /dev/null
@@ -0,0 +1,28 @@
+From 93984006ca6af7d067409fd6db2bedd999af2b0d Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:48 -0400
+Subject: ext4: fix quota accounting in case of fallocate
+
+commit 35121c9860316d7799cea0fbc359a9186e7c2747 upstream (as of v2.6.34-git13)
+
+allocated_meta_data is already included in 'used' variable.
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -1149,7 +1149,8 @@ void ext4_da_update_reserve_space(struct
+                */
+               if (allocated_meta_blocks)
+                       vfs_dq_claim_block(inode, allocated_meta_blocks);
+-              vfs_dq_release_reservation_block(inode, mdb_free + used);
++              vfs_dq_release_reservation_block(inode, mdb_free + used -
++                                              allocated_meta_blocks);
+       }
+       /*
diff --git a/queue-2.6.32/0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch b/queue-2.6.32/0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch
new file mode 100644 (file)
index 0000000..8d30580
--- /dev/null
@@ -0,0 +1,46 @@
+From 9e92f0bbe85a6ceead4b1215861f1a30bfe1d9dc Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:49 -0400
+Subject: ext4: check s_log_groups_per_flex in online resize code
+
+commit 42007efd569f1cf3bfb9a61da60ef6c2179508ca upstream (as of v2.6.34-git13)
+
+If groups_per_flex < 2, sbi->s_flex_groups[] doesn't get filled out,
+and every other access to this first tests s_log_groups_per_flex;
+same thing needs to happen in resize or we'll wander off into
+a null pointer when doing an online resize of the file system.
+
+Thanks to Christoph Biedl, who came up with the trivial testcase:
+
+# truncate --size 128M fsfile
+# mkfs.ext3 -F fsfile
+# tune2fs -O extents,uninit_bg,dir_index,flex_bg,huge_file,dir_nlink,extra_isize fsfile
+# e2fsck -yDf -C0 fsfile
+# truncate --size 132M fsfile
+# losetup /dev/loop0 fsfile
+# mount /dev/loop0 mnt
+# resize2fs -p /dev/loop0
+
+       https://bugzilla.kernel.org/show_bug.cgi?id=13549
+
+Reported-by: Alessandro Polverini <alex@nibbles.it>
+Test-case-by: Christoph Biedl  <bugzilla.kernel.bpeb@manchmal.in-ulm.de>
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/resize.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/resize.c
++++ b/fs/ext4/resize.c
+@@ -930,7 +930,8 @@ int ext4_group_add(struct super_block *s
+       percpu_counter_add(&sbi->s_freeinodes_counter,
+                          EXT4_INODES_PER_GROUP(sb));
+-      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
++      if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
++          sbi->s_log_groups_per_flex) {
+               ext4_group_t flex_group;
+               flex_group = ext4_flex_group(sbi, input->group);
+               atomic_add(input->free_blocks_count,
diff --git a/queue-2.6.32/0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch b/queue-2.6.32/0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch
new file mode 100644 (file)
index 0000000..a224c52
--- /dev/null
@@ -0,0 +1,86 @@
+From 168b7c0d3438662c33488f73a27036f14c176efc Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:50 -0400
+Subject: ext4: don't return to userspace after freezing the fs with a mutex held
+
+commit 6b0310fbf087ad6e9e3b8392adca97cd77184084 upstream (as of v2.6.34-git13)
+
+ext4_freeze() used jbd2_journal_lock_updates() which takes
+the j_barrier mutex, and then returns to userspace.  The
+kernel does not like this:
+
+================================================
+[ BUG: lock held when returning to user space! ]
+------------------------------------------------
+lvcreate/1075 is leaving the kernel with locks still held!
+1 lock held by lvcreate/1075:
+ #0:  (&journal->j_barrier){+.+...}, at: [<ffffffff811c6214>]
+jbd2_journal_lock_updates+0xe1/0xf0
+
+Use vfs_check_frozen() added to ext4_journal_start_sb() and
+ext4_force_commit() instead.
+
+Addresses-Red-Hat-Bugzilla: #568503
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c |   20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -227,6 +227,7 @@ handle_t *ext4_journal_start_sb(struct s
+       if (sb->s_flags & MS_RDONLY)
+               return ERR_PTR(-EROFS);
++      vfs_check_frozen(sb, SB_FREEZE_WRITE);
+       /* Special case here: if the journal has aborted behind our
+        * backs (eg. EIO in the commit thread), then we still need to
+        * take the FS itself readonly cleanly. */
+@@ -3391,8 +3392,10 @@ int ext4_force_commit(struct super_block
+               return 0;
+       journal = EXT4_SB(sb)->s_journal;
+-      if (journal)
++      if (journal) {
++              vfs_check_frozen(sb, SB_FREEZE_WRITE);
+               ret = ext4_journal_force_commit(journal);
++      }
+       return ret;
+ }
+@@ -3441,18 +3444,16 @@ static int ext4_freeze(struct super_bloc
+        * the journal.
+        */
+       error = jbd2_journal_flush(journal);
+-      if (error < 0) {
+-      out:
+-              jbd2_journal_unlock_updates(journal);
+-              return error;
+-      }
++      if (error < 0)
++              goto out;
+       /* Journal blocked and flushed, clear needs_recovery flag. */
+       EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       error = ext4_commit_super(sb, 1);
+-      if (error)
+-              goto out;
+-      return 0;
++out:
++      /* we rely on s_frozen to stop further updates */
++      jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
++      return error;
+ }
+ /*
+@@ -3469,7 +3470,6 @@ static int ext4_unfreeze(struct super_bl
+       EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       ext4_commit_super(sb, 1);
+       unlock_super(sb);
+-      jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+       return 0;
+ }
diff --git a/queue-2.6.32/0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch b/queue-2.6.32/0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch
new file mode 100644 (file)
index 0000000..d942d74
--- /dev/null
@@ -0,0 +1,44 @@
+From 0778bf26394249a97740013f92198b5272703e8b Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:51 -0400
+Subject: ext4: stop issuing discards if not supported by device
+
+commit a30eec2a8650a77f754e84b2e15f062fe652baa7 upstream (as of v2.6.34-git13)
+
+Turn off issuance of discard requests if the device does
+not support it - similar to the action we take for barriers.
+This will save a little computation time if a non-discardable
+device is mounted with -o discard, and also makes it obvious
+that it's not doing what was asked at mount time ...
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/mballoc.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2536,6 +2536,7 @@ static void release_blocks_on_commit(jou
+                        entry->count, entry->group, entry);
+               if (test_opt(sb, DISCARD)) {
++                      int ret;
+                       ext4_fsblk_t discard_block;
+                       discard_block = entry->start_blk +
+@@ -2543,7 +2544,12 @@ static void release_blocks_on_commit(jou
+                       trace_ext4_discard_blocks(sb,
+                                       (unsigned long long)discard_block,
+                                       entry->count);
+-                      sb_issue_discard(sb, discard_block, entry->count);
++                      ret = sb_issue_discard(sb, discard_block, entry->count);
++                      if (ret == EOPNOTSUPP) {
++                              ext4_warning(sb, __func__,
++                                      "discard not supported, disabling");
++                              clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
++                      }
+               }
+               err = ext4_mb_load_buddy(sb, entry->group, &e4b);
diff --git a/queue-2.6.32/0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch b/queue-2.6.32/0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch
new file mode 100644 (file)
index 0000000..95ea1c2
--- /dev/null
@@ -0,0 +1,62 @@
+From 2f4283aff3e5415fa36cbf81aa2a6247bfbb0527 Mon Sep 17 00:00:00 2001
+From: Eric Sandeen <sandeen@redhat.com>
+Date: Sun, 30 May 2010 22:49:52 -0400
+Subject: ext4: don't scan/accumulate more pages than mballoc will allocate
+
+commit c445e3e0a5c2804524dec6e55f66d63f6bc5bc3e upstream (as of v2.6.34-git13)
+
+There was a bug reported on RHEL5 that a 10G dd on a 12G box
+had a very, very slow sync after that.
+
+At issue was the loop in write_cache_pages scanning all the way
+to the end of the 10G file, even though the subsequent call
+to mpage_da_submit_io would only actually write a smallish amt; then
+we went back to the write_cache_pages loop ... wasting tons of time
+in calling __mpage_da_writepage for thousands of pages we would
+just revisit (many times) later.
+
+Upstream it's not such a big issue for sys_sync because we get
+to the loop with a much smaller nr_to_write, which limits the loop.
+
+However, talking with Aneesh he realized that fsync upstream still
+gets here with a very large nr_to_write and we face the same problem.
+
+This patch makes mpage_add_bh_to_extent stop the loop after we've
+accumulated 2048 pages, by setting mpd->io_done = 1; which ultimately
+causes the write_cache_pages loop to break.
+
+Repeating the test with a dirty_ratio of 80 (to leave something for
+fsync to do), I don't see huge IO performance gains, but the reduction
+in cpu usage is striking: 80% usage with stock, and 2% with the
+below patch.  Instrumenting the loop in write_cache_pages clearly
+shows that we are wasting time here.
+
+Eventually we need to change mpage_da_map_pages() to also submit its I/O
+to the block layer, subsuming mpage_da_submit_io(), and then change it
+to call ext4_get_blocks() multiple times.
+
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2361,6 +2361,15 @@ static void mpage_add_bh_to_extent(struc
+       sector_t next;
+       int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
++      /*
++       * XXX Don't go larger than mballoc is willing to allocate
++       * This is a stopgap solution.  We eventually need to fold
++       * mpage_da_submit_io() into this function and then call
++       * ext4_get_blocks() multiple times in a loop
++       */
++      if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
++              goto flush_it;
++
+       /* check if thereserved journal credits might overflow */
+       if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+               if (nrblocks >= EXT4_MAX_TRANS_DATA) {
diff --git a/queue-2.6.32/0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch b/queue-2.6.32/0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch
new file mode 100644 (file)
index 0000000..9612827
--- /dev/null
@@ -0,0 +1,199 @@
+From 3f9db529f4db9500a2bc9d296258a0dd8f9ac03e Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:53 -0400
+Subject: ext4: Do not zero out uninitialized extents beyond i_size
+
+commit 21ca087a3891efab4d45488db8febee474d26c68 upstream (as of v2.6.34-git13)
+
+The extents code will sometimes zero out blocks and mark them as
+initialized instead of splitting an extent into several smaller ones.
+This optimization however, causes problems if the extent is beyond
+i_size because fsck will complain if there are uninitialized blocks
+after i_size as this can not be distinguished from an inode that has
+an incorrect i_size field.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15742
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |   67 +++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 51 insertions(+), 16 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -2533,11 +2533,21 @@ static int ext4_ext_convert_to_initializ
+       struct ext4_extent *ex2 = NULL;
+       struct ext4_extent *ex3 = NULL;
+       struct ext4_extent_header *eh;
+-      ext4_lblk_t ee_block;
++      ext4_lblk_t ee_block, eof_block;
+       unsigned int allocated, ee_len, depth;
+       ext4_fsblk_t newblock;
+       int err = 0;
+       int ret = 0;
++      int may_zeroout;
++
++      ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
++              "block %llu, max_blocks %u\n", inode->i_ino,
++              (unsigned long long)iblock, max_blocks);
++
++      eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
++              inode->i_sb->s_blocksize_bits;
++      if (eof_block < iblock + max_blocks)
++              eof_block = iblock + max_blocks;
+       depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
+@@ -2546,16 +2556,23 @@ static int ext4_ext_convert_to_initializ
+       ee_len = ext4_ext_get_actual_len(ex);
+       allocated = ee_len - (iblock - ee_block);
+       newblock = iblock - ee_block + ext_pblock(ex);
++
+       ex2 = ex;
+       orig_ex.ee_block = ex->ee_block;
+       orig_ex.ee_len   = cpu_to_le16(ee_len);
+       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
++      /*
++       * It is safe to convert extent to initialized via explicit
+       * zeroout only if extent is fully inside i_size or new_size.
++       */
++      may_zeroout = ee_block + ee_len <= eof_block;
++
+       err = ext4_ext_get_access(handle, inode, path + depth);
+       if (err)
+               goto out;
+       /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
+-      if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
++      if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
+               err =  ext4_ext_zeroout(inode, &orig_ex);
+               if (err)
+                       goto fix_extent_len;
+@@ -2586,7 +2603,7 @@ static int ext4_ext_convert_to_initializ
+       if (allocated > max_blocks) {
+               unsigned int newdepth;
+               /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
+-              if (allocated <= EXT4_EXT_ZERO_LEN) {
++              if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
+                       /*
+                        * iblock == ee_block is handled by the zerouout
+                        * at the beginning.
+@@ -2662,7 +2679,7 @@ static int ext4_ext_convert_to_initializ
+               ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+               ext4_ext_mark_uninitialized(ex3);
+               err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
+-              if (err == -ENOSPC) {
++              if (err == -ENOSPC && may_zeroout) {
+                       err =  ext4_ext_zeroout(inode, &orig_ex);
+                       if (err)
+                               goto fix_extent_len;
+@@ -2686,8 +2703,10 @@ static int ext4_ext_convert_to_initializ
+                * update the extent length after successful insert of the
+                * split extent
+                */
+-              orig_ex.ee_len = cpu_to_le16(ee_len -
+-                                              ext4_ext_get_actual_len(ex3));
++              ee_len -= ext4_ext_get_actual_len(ex3);
++              orig_ex.ee_len = cpu_to_le16(ee_len);
++              may_zeroout = ee_block + ee_len <= eof_block;
++
+               depth = newdepth;
+               ext4_ext_drop_refs(path);
+               path = ext4_ext_find_extent(inode, iblock, path);
+@@ -2711,7 +2730,7 @@ static int ext4_ext_convert_to_initializ
+                * otherwise give the extent a chance to merge to left
+                */
+               if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
+-                                                      iblock != ee_block) {
++                      iblock != ee_block && may_zeroout) {
+                       err =  ext4_ext_zeroout(inode, &orig_ex);
+                       if (err)
+                               goto fix_extent_len;
+@@ -2780,7 +2799,7 @@ static int ext4_ext_convert_to_initializ
+       goto out;
+ insert:
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
+-      if (err == -ENOSPC) {
++      if (err == -ENOSPC && may_zeroout) {
+               err =  ext4_ext_zeroout(inode, &orig_ex);
+               if (err)
+                       goto fix_extent_len;
+@@ -2840,14 +2859,21 @@ static int ext4_split_unwritten_extents(
+       struct ext4_extent *ex2 = NULL;
+       struct ext4_extent *ex3 = NULL;
+       struct ext4_extent_header *eh;
+-      ext4_lblk_t ee_block;
++      ext4_lblk_t ee_block, eof_block;
+       unsigned int allocated, ee_len, depth;
+       ext4_fsblk_t newblock;
+       int err = 0;
++      int may_zeroout;
++
++      ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
++              "block %llu, max_blocks %u\n", inode->i_ino,
++              (unsigned long long)iblock, max_blocks);
++
++      eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
++              inode->i_sb->s_blocksize_bits;
++      if (eof_block < iblock + max_blocks)
++              eof_block = iblock + max_blocks;
+-      ext_debug("ext4_split_unwritten_extents: inode %lu,"
+-                "iblock %llu, max_blocks %u\n", inode->i_ino,
+-                (unsigned long long)iblock, max_blocks);
+       depth = ext_depth(inode);
+       eh = path[depth].p_hdr;
+       ex = path[depth].p_ext;
+@@ -2855,12 +2881,19 @@ static int ext4_split_unwritten_extents(
+       ee_len = ext4_ext_get_actual_len(ex);
+       allocated = ee_len - (iblock - ee_block);
+       newblock = iblock - ee_block + ext_pblock(ex);
++
+       ex2 = ex;
+       orig_ex.ee_block = ex->ee_block;
+       orig_ex.ee_len   = cpu_to_le16(ee_len);
+       ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+       /*
++       * It is safe to convert extent to initialized via explicit
+       * zeroout only if extent is fully inside i_size or new_size.
++       */
++      may_zeroout = ee_block + ee_len <= eof_block;
++
++      /*
+        * If the uninitialized extent begins at the same logical
+        * block where the write begins, and the write completely
+        * covers the extent, then we don't need to split it.
+@@ -2894,7 +2927,7 @@ static int ext4_split_unwritten_extents(
+               ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+               ext4_ext_mark_uninitialized(ex3);
+               err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
+-              if (err == -ENOSPC) {
++              if (err == -ENOSPC && may_zeroout) {
+                       err =  ext4_ext_zeroout(inode, &orig_ex);
+                       if (err)
+                               goto fix_extent_len;
+@@ -2918,8 +2951,10 @@ static int ext4_split_unwritten_extents(
+                * update the extent length after successful insert of the
+                * split extent
+                */
+-              orig_ex.ee_len = cpu_to_le16(ee_len -
+-                                              ext4_ext_get_actual_len(ex3));
++              ee_len -= ext4_ext_get_actual_len(ex3);
++              orig_ex.ee_len = cpu_to_le16(ee_len);
++              may_zeroout = ee_block + ee_len <= eof_block;
++
+               depth = newdepth;
+               ext4_ext_drop_refs(path);
+               path = ext4_ext_find_extent(inode, iblock, path);
+@@ -2965,7 +3000,7 @@ static int ext4_split_unwritten_extents(
+       goto out;
+ insert:
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+-      if (err == -ENOSPC) {
++      if (err == -ENOSPC && may_zeroout) {
+               err =  ext4_ext_zeroout(inode, &orig_ex);
+               if (err)
+                       goto fix_extent_len;
diff --git a/queue-2.6.32/0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch b/queue-2.6.32/0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch
new file mode 100644 (file)
index 0000000..4e50452
--- /dev/null
@@ -0,0 +1,124 @@
+From ae42cce7e825bdc82a8e9c30a87c342d1e364e57 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:54 -0400
+Subject: ext4: clean up inode bitmaps manipulation in ext4_free_inode
+
+commit d17413c08cd2b1dd2bf2cfdbb0f7b736b2b2b15c upstream (as of v2.6.34-git13)
+
+- Reorganize locking scheme to batch two atomic operations into one.
+  This also allows us to state that a healthy group must obey the following rule:
+  ext4_free_inodes_count(sb, gdp) == ext4_count_free(inode_bitmap, NUM);
+- Fix possible undefined pointer dereference.
+- Even if group descriptor stats aren't accessible we have to update
+  inode bitmaps.
+- Move non-group members update out of group_lock.
+
+Note: this commit has been observed to fix fs corruption problems
+under heavy fs load
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ialloc.c |   85 +++++++++++++++++++++++++------------------------------
+ 1 file changed, 39 insertions(+), 46 deletions(-)
+
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -244,57 +244,50 @@ void ext4_free_inode(handle_t *handle, s
+       if (fatal)
+               goto error_return;
+-      /* Ok, now we can actually update the inode bitmaps.. */
+-      cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+-                                      bit, bitmap_bh->b_data);
+-      if (!cleared)
+-              ext4_error(sb, "ext4_free_inode",
+-                         "bit already cleared for inode %lu", ino);
+-      else {
+-              gdp = ext4_get_group_desc(sb, block_group, &bh2);
+-
++      fatal = -ESRCH;
++      gdp = ext4_get_group_desc(sb, block_group, &bh2);
++      if (gdp) {
+               BUFFER_TRACE(bh2, "get_write_access");
+               fatal = ext4_journal_get_write_access(handle, bh2);
+-              if (fatal) goto error_return;
+-
+-              if (gdp) {
+-                      ext4_lock_group(sb, block_group);
+-                      count = ext4_free_inodes_count(sb, gdp) + 1;
+-                      ext4_free_inodes_set(sb, gdp, count);
+-                      if (is_directory) {
+-                              count = ext4_used_dirs_count(sb, gdp) - 1;
+-                              ext4_used_dirs_set(sb, gdp, count);
+-                              if (sbi->s_log_groups_per_flex) {
+-                                      ext4_group_t f;
+-
+-                                      f = ext4_flex_group(sbi, block_group);
+-                                      atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+-                              }
+-
+-                      }
+-                      gdp->bg_checksum = ext4_group_desc_csum(sbi,
+-                                                      block_group, gdp);
+-                      ext4_unlock_group(sb, block_group);
+-                      percpu_counter_inc(&sbi->s_freeinodes_counter);
+-                      if (is_directory)
+-                              percpu_counter_dec(&sbi->s_dirs_counter);
++      }
++      ext4_lock_group(sb, block_group);
++      cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
++      if (fatal || !cleared) {
++              ext4_unlock_group(sb, block_group);
++              goto out;
++      }
+-                      if (sbi->s_log_groups_per_flex) {
+-                              ext4_group_t f;
++      count = ext4_free_inodes_count(sb, gdp) + 1;
++      ext4_free_inodes_set(sb, gdp, count);
++      if (is_directory) {
++              count = ext4_used_dirs_count(sb, gdp) - 1;
++              ext4_used_dirs_set(sb, gdp, count);
++              percpu_counter_dec(&sbi->s_dirs_counter);
++      }
++      gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
++      ext4_unlock_group(sb, block_group);
+-                              f = ext4_flex_group(sbi, block_group);
+-                              atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+-                      }
+-              }
+-              BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+-              err = ext4_handle_dirty_metadata(handle, NULL, bh2);
+-              if (!fatal) fatal = err;
++      percpu_counter_inc(&sbi->s_freeinodes_counter);
++      if (sbi->s_log_groups_per_flex) {
++              ext4_group_t f = ext4_flex_group(sbi, block_group);
++
++              atomic_inc(&sbi->s_flex_groups[f].free_inodes);
++              if (is_directory)
++                      atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+       }
+-      BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+-      err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+-      if (!fatal)
+-              fatal = err;
+-      sb->s_dirt = 1;
++      BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
++      fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
++out:
++      if (cleared) {
++              BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
++              err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
++              if (!fatal)
++                      fatal = err;
++              sb->s_dirt = 1;
++      } else
++              ext4_error(sb, "ext4_free_inode",
++                         "bit already cleared for inode %lu", ino);
++
+ error_return:
+       brelse(bitmap_bh);
+       ext4_std_error(sb, fatal);
diff --git a/queue-2.6.32/0041-ext4-init-statistics-after-journal-recovery.patch b/queue-2.6.32/0041-ext4-init-statistics-after-journal-recovery.patch
new file mode 100644 (file)
index 0000000..72f7f31
--- /dev/null
@@ -0,0 +1,93 @@
+From 73337c4a1e35c3dedceb9e2d3af84da8614e6a45 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:49:55 -0400
+Subject: ext4: init statistics after journal recovery
+
+commit 84061e07c5fbbbf9dc8aef8fb750fc3a2dfc31f3 upstream (as of v2.6.34-git13)
+
+Currently the block/inode/dir counters are initialized before the journal
+is recovered. In fact, after journal recovery this info will probably
+change. And freeblocks is critical for correct delalloc mode
+accounting.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15768
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Acked-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c |   41 ++++++++++++++++++-----------------------
+ 1 file changed, 18 insertions(+), 23 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2695,24 +2695,6 @@ static int ext4_fill_super(struct super_
+       get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+       spin_lock_init(&sbi->s_next_gen_lock);
+-      err = percpu_counter_init(&sbi->s_freeblocks_counter,
+-                      ext4_count_free_blocks(sb));
+-      if (!err) {
+-              err = percpu_counter_init(&sbi->s_freeinodes_counter,
+-                              ext4_count_free_inodes(sb));
+-      }
+-      if (!err) {
+-              err = percpu_counter_init(&sbi->s_dirs_counter,
+-                              ext4_count_dirs(sb));
+-      }
+-      if (!err) {
+-              err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+-      }
+-      if (err) {
+-              ext4_msg(sb, KERN_ERR, "insufficient memory");
+-              goto failed_mount3;
+-      }
+-
+       sbi->s_stripe = ext4_get_stripe_size(sbi);
+       sbi->s_max_writeback_mb_bump = 128;
+@@ -2832,7 +2814,20 @@ static int ext4_fill_super(struct super_
+       set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
+ no_journal:
+-
++      err = percpu_counter_init(&sbi->s_freeblocks_counter,
++                                ext4_count_free_blocks(sb));
++      if (!err)
++              err = percpu_counter_init(&sbi->s_freeinodes_counter,
++                                        ext4_count_free_inodes(sb));
++      if (!err)
++              err = percpu_counter_init(&sbi->s_dirs_counter,
++                                        ext4_count_dirs(sb));
++      if (!err)
++              err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
++      if (err) {
++              ext4_msg(sb, KERN_ERR, "insufficient memory");
++              goto failed_mount_wq;
++      }
+       if (test_opt(sb, NOBH)) {
+               if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
+                       ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
+@@ -2965,6 +2960,10 @@ failed_mount_wq:
+               jbd2_journal_destroy(sbi->s_journal);
+               sbi->s_journal = NULL;
+       }
++      percpu_counter_destroy(&sbi->s_freeblocks_counter);
++      percpu_counter_destroy(&sbi->s_freeinodes_counter);
++      percpu_counter_destroy(&sbi->s_dirs_counter);
++      percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ failed_mount3:
+       if (sbi->s_flex_groups) {
+               if (is_vmalloc_addr(sbi->s_flex_groups))
+@@ -2972,10 +2971,6 @@ failed_mount3:
+               else
+                       kfree(sbi->s_flex_groups);
+       }
+-      percpu_counter_destroy(&sbi->s_freeblocks_counter);
+-      percpu_counter_destroy(&sbi->s_freeinodes_counter);
+-      percpu_counter_destroy(&sbi->s_dirs_counter);
+-      percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
+ failed_mount2:
+       for (i = 0; i < db_count; i++)
+               brelse(sbi->s_group_desc[i]);
diff --git a/queue-2.6.32/0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch b/queue-2.6.32/0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch
new file mode 100644 (file)
index 0000000..4a4cd7b
--- /dev/null
@@ -0,0 +1,57 @@
+From 2db9e1a9cc528228b60ece755187b60331db966d Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:56 -0400
+Subject: ext4: Remove extraneous newlines in ext4_msg() calls
+
+commit fbe845ddf368f77f86aa7500f8fd2690f54c66a8 upstream (as of v2.6.34-git13)
+
+Addresses-Google-Bug: #2562325
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/inode.c |    6 +++---
+ fs/ext4/super.c |    2 +-
+ 2 files changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -2294,7 +2294,7 @@ static int mpage_da_map_blocks(struct mp
+               ext4_msg(mpd->inode->i_sb, KERN_CRIT,
+                        "delayed block allocation failed for inode %lu at "
+                        "logical offset %llu with max blocks %zd with "
+-                       "error %d\n", mpd->inode->i_ino,
++                       "error %d", mpd->inode->i_ino,
+                        (unsigned long long) next,
+                        mpd->b_size >> mpd->inode->i_blkbits, err);
+               printk(KERN_CRIT "This should not happen!!  "
+@@ -2956,7 +2956,7 @@ retry:
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
+-                             "%ld pages, ino %lu; err %d\n", __func__,
++                             "%ld pages, ino %lu; err %d", __func__,
+                               wbc->nr_to_write, inode->i_ino, ret);
+                       goto out_writepages;
+               }
+@@ -3031,7 +3031,7 @@ retry:
+       if (pages_skipped != wbc->pages_skipped)
+               ext4_msg(inode->i_sb, KERN_CRIT,
+                        "This should not happen leaving %s "
+-                       "with nr_to_write = %ld ret = %d\n",
++                       "with nr_to_write = %ld ret = %d",
+                        __func__, wbc->nr_to_write, ret);
+       /* Update index */
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -2902,7 +2902,7 @@ no_journal:
+       err = ext4_setup_system_zone(sb);
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to initialize system "
+-                       "zone (%d)\n", err);
++                       "zone (%d)", err);
+               goto failed_mount4;
+       }
diff --git a/queue-2.6.32/0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch b/queue-2.6.32/0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch
new file mode 100644 (file)
index 0000000..2d25790
--- /dev/null
@@ -0,0 +1,32 @@
+From 1050094d53941e319e9d50d4171f060dddd5dc87 Mon Sep 17 00:00:00 2001
+From: Nikanth Karthikesan <knikanth@suse.de>
+Date: Sun, 30 May 2010 22:49:57 -0400
+Subject: ext4: Prevent creation of files larger than RLIMIT_FSIZE using fallocate
+
+commit 6d19c42b7cf81c39632b6d4dbc514e8449bcd346 upstream (as of v2.6.34-git13)
+
+Currently using posix_fallocate one can bypass an RLIMIT_FSIZE limit
+and create a file larger than the limit. Add a check for that.
+
+Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
+Signed-off-by: Amit Arora <aarora@in.ibm.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3607,6 +3607,11 @@ long ext4_fallocate(struct inode *inode,
+        */
+       credits = ext4_chunk_trans_blocks(inode, max_blocks);
+       mutex_lock(&inode->i_mutex);
++      ret = inode_newsize_ok(inode, (len + offset));
++      if (ret) {
++              mutex_unlock(&inode->i_mutex);
++              return ret;
++      }
+ retry:
+       while (ret >= 0 && ret < max_blocks) {
+               block = block + ret;
diff --git a/queue-2.6.32/0044-ext4-check-for-a-good-block-group-before-loading-bud.patch b/queue-2.6.32/0044-ext4-check-for-a-good-block-group-before-loading-bud.patch
new file mode 100644 (file)
index 0000000..69f7090
--- /dev/null
@@ -0,0 +1,214 @@
+From 7d4df70b86aef3e1c2b92bede60009527b3470fd Mon Sep 17 00:00:00 2001
+From: Curt Wohlgemuth <curtw@google.com>
+Date: Sun, 30 May 2010 22:49:58 -0400
+Subject: ext4: check for a good block group before loading buddy pages
+
+commit 8a57d9d61a6e361c7bb159dda797672c1df1a691 upstream (as of v2.6.34-git13)
+
+This adds a new field in ext4_group_info to cache the largest available
+block range in a block group; and don't load the buddy pages until *after*
+we've done a sanity check on the block group.
+
+With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
+partitions, it's easy to have no block groups with a block extent large
+enough to satisfy the input request length.  This currently causes the loop
+during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
+for EVERY block group.  That can be a lot of pages.  The patch below allows
+us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
+have to check again after we lock the block group).
+
+Addresses-Google-Bug: #2578108
+Addresses-Google-Bug: #2704453
+
+Signed-off-by: Curt Wohlgemuth <curtw@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h    |    1 
+ fs/ext4/mballoc.c |   70 +++++++++++++++++++++++++++++++++++++++++++-----------
+ 2 files changed, 58 insertions(+), 13 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1657,6 +1657,7 @@ struct ext4_group_info {
+       ext4_grpblk_t   bb_first_free;  /* first free block */
+       ext4_grpblk_t   bb_free;        /* total free blocks */
+       ext4_grpblk_t   bb_fragments;   /* nr of freespace fragments */
++      ext4_grpblk_t   bb_largest_free_order;/* order of largest frag in BG */
+       struct          list_head bb_prealloc_list;
+ #ifdef DOUBLE_CHECK
+       void            *bb_bitmap;
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(str
+       }
+ }
++/*
++ * Cache the order of the largest free extent we have available in this block
++ * group.
++ */
++static void
++mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
++{
++      int i;
++      int bits;
++
++      grp->bb_largest_free_order = -1; /* uninit */
++
++      bits = sb->s_blocksize_bits + 1;
++      for (i = bits; i >= 0; i--) {
++              if (grp->bb_counters[i] > 0) {
++                      grp->bb_largest_free_order = i;
++                      break;
++              }
++      }
++}
++
+ static noinline_for_stack
+ void ext4_mb_generate_buddy(struct super_block *sb,
+                               void *buddy, void *bitmap, ext4_group_t group)
+@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super
+                */
+               grp->bb_free = free;
+       }
++      mb_set_largest_free_order(sb, grp);
+       clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
+@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super
+  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
+  * So it can have information regarding groups_per_page which
+  * is blocks_per_page/2
++ *
++ * Locking note:  This routine takes the block group lock of all groups
++ * for this page; do not hold this lock when calling this routine!
+  */
+ static int ext4_mb_init_cache(struct page *page, char *incore)
+@@ -910,6 +935,11 @@ out:
+       return err;
+ }
++/*
++ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
++ * block group lock of all groups for this page; do not hold the BG lock when
++ * calling this routine!
++ */
+ static noinline_for_stack
+ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
+ {
+@@ -1004,6 +1034,11 @@ err:
+       return ret;
+ }
++/*
++ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
++ * block group lock of all groups for this page; do not hold the BG lock when
++ * calling this routine!
++ */
+ static noinline_for_stack int
+ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
+                                       struct ext4_buddy *e4b)
+@@ -1300,6 +1335,7 @@ static void mb_free_blocks(struct inode
+                       buddy = buddy2;
+               } while (1);
+       }
++      mb_set_largest_free_order(sb, e4b->bd_info);
+       mb_check_buddy(e4b);
+ }
+@@ -1428,6 +1464,7 @@ static int mb_mark_used(struct ext4_budd
+               e4b->bd_info->bb_counters[ord]++;
+               e4b->bd_info->bb_counters[ord]++;
+       }
++      mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
+       mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+       mb_check_buddy(e4b);
+@@ -1823,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_al
+       }
+ }
++/* This is now called BEFORE we load the buddy bitmap. */
+ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
+                               ext4_group_t group, int cr)
+ {
+       unsigned free, fragments;
+-      unsigned i, bits;
+       int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
+       struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
+       BUG_ON(cr < 0 || cr >= 4);
+-      BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
++
++      /* We only do this if the grp has never been initialized */
++      if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
++              int ret = ext4_mb_init_group(ac->ac_sb, group);
++              if (ret)
++                      return 0;
++      }
+       free = grp->bb_free;
+       fragments = grp->bb_fragments;
+@@ -1845,17 +1888,16 @@ static int ext4_mb_good_group(struct ext
+       case 0:
+               BUG_ON(ac->ac_2order == 0);
++              if (grp->bb_largest_free_order < ac->ac_2order)
++                      return 0;
++
+               /* Avoid using the first bg of a flexgroup for data files */
+               if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+                   (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+                   ((group % flex_size) == 0))
+                       return 0;
+-              bits = ac->ac_sb->s_blocksize_bits + 1;
+-              for (i = ac->ac_2order; i <= bits; i++)
+-                      if (grp->bb_counters[i] > 0)
+-                              return 1;
+-              break;
++              return 1;
+       case 1:
+               if ((free / fragments) >= ac->ac_g_ex.fe_len)
+                       return 1;
+@@ -2026,14 +2068,11 @@ repeat:
+               group = ac->ac_g_ex.fe_group;
+               for (i = 0; i < ngroups; group++, i++) {
+-                      struct ext4_group_info *grp;
+-
+                       if (group == ngroups)
+                               group = 0;
+-                      /* quick check to skip empty groups */
+-                      grp = ext4_get_group_info(sb, group);
+-                      if (grp->bb_free == 0)
++                      /* This now checks without needing the buddy page */
++                      if (!ext4_mb_good_group(ac, group, cr))
+                               continue;
+                       err = ext4_mb_load_buddy(sb, group, &e4b);
+@@ -2041,8 +2080,12 @@ repeat:
+                               goto out;
+                       ext4_lock_group(sb, group);
++
++                      /*
++                       * We need to check again after locking the
++                       * block group
++                       */
+                       if (!ext4_mb_good_group(ac, group, cr)) {
+-                              /* someone did allocation from this group */
+                               ext4_unlock_group(sb, group);
+                               ext4_mb_unload_buddy(&e4b);
+                               continue;
+@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_b
+       INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+       init_rwsem(&meta_group_info[i]->alloc_sem);
+       meta_group_info[i]->bb_free_root.rb_node = NULL;
++      meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
+ #ifdef DOUBLE_CHECK
+       {
diff --git a/queue-2.6.32/0045-ext4-Show-journal_checksum-option.patch b/queue-2.6.32/0045-ext4-Show-journal_checksum-option.patch
new file mode 100644 (file)
index 0000000..b497975
--- /dev/null
@@ -0,0 +1,27 @@
+From ab93377b76de07d4c8aacde97418651c7df6854e Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Sun, 30 May 2010 22:49:59 -0400
+Subject: ext4: Show journal_checksum option
+
+commit 39a4bade8c1826b658316d66ee81c09b0a4d7d42 upstream (as of v2.6.34-git13)
+
+We failed to show journal_checksum option in /proc/mounts. Fix it.
+
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/super.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -877,6 +877,8 @@ static int ext4_show_options(struct seq_
+       seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
+       if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+               seq_puts(seq, ",journal_async_commit");
++      else if (test_opt(sb, JOURNAL_CHECKSUM))
++              seq_puts(seq, ",journal_checksum");
+       if (test_opt(sb, NOBH))
+               seq_puts(seq, ",nobh");
+       if (test_opt(sb, I_VERSION))
diff --git a/queue-2.6.32/0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch b/queue-2.6.32/0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch
new file mode 100644 (file)
index 0000000..03444c2
--- /dev/null
@@ -0,0 +1,558 @@
+From cc781d3f1f03b2fd24b7260ed319dc34bf605ed0 Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:50:00 -0400
+Subject: ext4: Use bitops to read/modify i_flags in struct ext4_inode_info
+
+commit 12e9b892002d9af057655d35b44db8ee9243b0dc upstream (as of v2.6.34-git13)
+
+At several places we modify EXT4_I(inode)->i_flags without holding
+i_mutex (ext4_do_update_inode, ...). These modifications are racy and
+we can lose updates to i_flags. So convert handling of i_flags to use
+bitops which are atomic.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15792
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/dir.c         |    4 -
+ fs/ext4/ext4.h        |  109 +++++++++++++++++++++++++++++++++++++++++++-------
+ fs/ext4/ext4_jbd2.h   |    6 +-
+ fs/ext4/extents.c     |   10 ++--
+ fs/ext4/file.c        |    2 
+ fs/ext4/ialloc.c      |    4 -
+ fs/ext4/inode.c       |   30 ++++++-------
+ fs/ext4/mballoc.c     |    4 -
+ fs/ext4/migrate.c     |    2 
+ fs/ext4/move_extent.c |    4 -
+ fs/ext4/namei.c       |   10 ++--
+ fs/ext4/super.c       |    1 
+ fs/ext4/xattr.c       |    4 -
+ 13 files changed, 135 insertions(+), 55 deletions(-)
+
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -111,7 +111,7 @@ static int ext4_readdir(struct file *fil
+       if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+                                   EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+-          ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
++          ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
+            ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
+               err = ext4_dx_readdir(filp, dirent, filldir);
+               if (err != ERR_BAD_DX_DIR) {
+@@ -122,7 +122,7 @@ static int ext4_readdir(struct file *fil
+                * We don't set the inode dirty flag since it's not
+                * critical that it get flushed back to the disk.
+                */
+-              EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
++              ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
+       }
+       stored = 0;
+       offset = filp->f_pos & (sb->s_blocksize - 1);
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -315,6 +315,83 @@ static inline __u32 ext4_mask_flags(umod
+               return flags & EXT4_OTHER_FLMASK;
+ }
++/*
++ * Inode flags used for atomic set/get
++ */
++enum {
++      EXT4_INODE_SECRM        = 0,    /* Secure deletion */
++      EXT4_INODE_UNRM         = 1,    /* Undelete */
++      EXT4_INODE_COMPR        = 2,    /* Compress file */
++      EXT4_INODE_SYNC         = 3,    /* Synchronous updates */
++      EXT4_INODE_IMMUTABLE    = 4,    /* Immutable file */
++      EXT4_INODE_APPEND       = 5,    /* writes to file may only append */
++      EXT4_INODE_NODUMP       = 6,    /* do not dump file */
++      EXT4_INODE_NOATIME      = 7,    /* do not update atime */
++/* Reserved for compression usage... */
++      EXT4_INODE_DIRTY        = 8,
++      EXT4_INODE_COMPRBLK     = 9,    /* One or more compressed clusters */
++      EXT4_INODE_NOCOMPR      = 10,   /* Don't compress */
++      EXT4_INODE_ECOMPR       = 11,   /* Compression error */
++/* End compression flags --- maybe not all used */
++      EXT4_INODE_INDEX        = 12,   /* hash-indexed directory */
++      EXT4_INODE_IMAGIC       = 13,   /* AFS directory */
++      EXT4_INODE_JOURNAL_DATA = 14,   /* file data should be journaled */
++      EXT4_INODE_NOTAIL       = 15,   /* file tail should not be merged */
++      EXT4_INODE_DIRSYNC      = 16,   /* dirsync behaviour (directories only) */
++      EXT4_INODE_TOPDIR       = 17,   /* Top of directory hierarchies*/
++      EXT4_INODE_HUGE_FILE    = 18,   /* Set to each huge file */
++      EXT4_INODE_EXTENTS      = 19,   /* Inode uses extents */
++      EXT4_INODE_EA_INODE     = 21,   /* Inode used for large EA */
++      EXT4_INODE_EOFBLOCKS    = 22,   /* Blocks allocated beyond EOF */
++      EXT4_INODE_RESERVED     = 31,   /* reserved for ext4 lib */
++};
++
++#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
++#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
++      printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
++              EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
++
++/*
++ * Since it's pretty easy to mix up bit numbers and hex values, and we
++ * can't do a compile-time test for ENUM values, we use a run-time
++ * test to make sure that EXT4_XXX_FL is consistent with respect to
++ * EXT4_INODE_XXX.  If all is well the printk and BUG_ON will all drop
++ * out so it won't cost any extra space in the compiled kernel image.
++ * But it's important that these values are the same, since we are
++ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
++ * must be consistent with the values of FS_XXX_FL defined in
++ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
++ * ext4 filesystems, and of course the values defined in e2fsprogs.
++ *
++ * It's not paranoia if Murphy's Law really *is* out to get you.  :-)
++ */
++static inline void ext4_check_flag_values(void)
++{
++      CHECK_FLAG_VALUE(SECRM);
++      CHECK_FLAG_VALUE(UNRM);
++      CHECK_FLAG_VALUE(COMPR);
++      CHECK_FLAG_VALUE(SYNC);
++      CHECK_FLAG_VALUE(IMMUTABLE);
++      CHECK_FLAG_VALUE(APPEND);
++      CHECK_FLAG_VALUE(NODUMP);
++      CHECK_FLAG_VALUE(NOATIME);
++      CHECK_FLAG_VALUE(DIRTY);
++      CHECK_FLAG_VALUE(COMPRBLK);
++      CHECK_FLAG_VALUE(NOCOMPR);
++      CHECK_FLAG_VALUE(ECOMPR);
++      CHECK_FLAG_VALUE(INDEX);
++      CHECK_FLAG_VALUE(IMAGIC);
++      CHECK_FLAG_VALUE(JOURNAL_DATA);
++      CHECK_FLAG_VALUE(NOTAIL);
++      CHECK_FLAG_VALUE(DIRSYNC);
++      CHECK_FLAG_VALUE(TOPDIR);
++      CHECK_FLAG_VALUE(HUGE_FILE);
++      CHECK_FLAG_VALUE(EXTENTS);
++      CHECK_FLAG_VALUE(EA_INODE);
++      CHECK_FLAG_VALUE(EOFBLOCKS);
++      CHECK_FLAG_VALUE(RESERVED);
++}
++
+ /* Used to pass group descriptor data when online resize is done */
+ struct ext4_new_group_input {
+       __u32 group;            /* Group number for this data */
+@@ -603,9 +680,8 @@ struct ext4_ext_cache {
+  */
+ struct ext4_inode_info {
+       __le32  i_data[15];     /* unconverted */
+-      __u32   i_flags;
+-      ext4_fsblk_t    i_file_acl;
+       __u32   i_dtime;
++      ext4_fsblk_t    i_file_acl;
+       /*
+        * i_block_group is the number of the block group which contains
+@@ -616,6 +692,7 @@ struct ext4_inode_info {
+        */
+       ext4_group_t    i_block_group;
+       unsigned long   i_state_flags;          /* Dynamic state flags */
++      unsigned long   i_flags;
+       ext4_lblk_t             i_dir_start_lookup;
+ #ifdef CONFIG_EXT4_FS_XATTR
+@@ -1049,20 +1126,22 @@ enum {
+       EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
+ };
+-static inline int ext4_test_inode_state(struct inode *inode, int bit)
+-{
+-      return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+-}
+-
+-static inline void ext4_set_inode_state(struct inode *inode, int bit)
+-{
+-      set_bit(bit, &EXT4_I(inode)->i_state_flags);
++#define EXT4_INODE_BIT_FNS(name, field)                                       \
++static inline int ext4_test_inode_##name(struct inode *inode, int bit)        \
++{                                                                     \
++      return test_bit(bit, &EXT4_I(inode)->i_##field);                \
++}                                                                     \
++static inline void ext4_set_inode_##name(struct inode *inode, int bit)        \
++{                                                                     \
++      set_bit(bit, &EXT4_I(inode)->i_##field);                        \
++}                                                                     \
++static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
++{                                                                     \
++      clear_bit(bit, &EXT4_I(inode)->i_##field);                      \
+ }
+-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+-{
+-      clear_bit(bit, &EXT4_I(inode)->i_state_flags);
+-}
++EXT4_INODE_BIT_FNS(flag, flags)
++EXT4_INODE_BIT_FNS(state, state_flags)
+ #else
+ /* Assume that user mode programs are passing in an ext4fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+@@ -1247,7 +1326,7 @@ struct ext4_dir_entry_2 {
+ #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
+                                     EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+-                    (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
++                  ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
+ #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
+ #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
+--- a/fs/ext4/ext4_jbd2.h
++++ b/fs/ext4/ext4_jbd2.h
+@@ -282,7 +282,7 @@ static inline int ext4_should_journal_da
+               return 1;
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+               return 1;
+-      if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
++      if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+               return 1;
+       return 0;
+ }
+@@ -293,7 +293,7 @@ static inline int ext4_should_order_data
+               return 0;
+       if (!S_ISREG(inode->i_mode))
+               return 0;
+-      if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
++      if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+               return 0;
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+               return 1;
+@@ -306,7 +306,7 @@ static inline int ext4_should_writeback_
+               return 0;
+       if (EXT4_JOURNAL(inode) == NULL)
+               return 1;
+-      if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
++      if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+               return 0;
+       if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+               return 1;
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3409,12 +3409,12 @@ int ext4_ext_get_blocks(handle_t *handle
+               }
+       }
+-      if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
++      if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+               if (eh->eh_entries) {
+                       last_ex = EXT_LAST_EXTENT(eh);
+                       if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+                                           + ext4_ext_get_actual_len(last_ex))
+-                              EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++                              ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+               } else {
+                       WARN_ON(eh->eh_entries == 0);
+                       ext4_error(inode->i_sb, __func__,
+@@ -3560,7 +3560,7 @@ static void ext4_falloc_update_inode(str
+                * can proceed even if the new size is the same as i_size.
+                */
+               if (new_size > i_size_read(inode))
+-                      EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
++                      ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       }
+ }
+@@ -3588,7 +3588,7 @@ long ext4_fallocate(struct inode *inode,
+        * currently supporting (pre)allocate mode for extent-based
+        * files _only_
+        */
+-      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+               return -EOPNOTSUPP;
+       /* preallocation to directories is currently not supported */
+@@ -3838,7 +3838,7 @@ int ext4_fiemap(struct inode *inode, str
+       int error = 0;
+       /* fallback to generic here if not in extents fmt */
+-      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+               return generic_block_fiemap(inode, fieinfo, start, len,
+                       ext4_get_block);
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -65,7 +65,7 @@ ext4_file_write(struct kiocb *iocb, cons
+        * is smaller than s_maxbytes, which is for extent-mapped files.
+        */
+-      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
++      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+               size_t length = iov_length(iov, nr_segs);
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -497,7 +497,7 @@ static int find_group_orlov(struct super
+       if (S_ISDIR(mode) &&
+           ((parent == sb->s_root->d_inode) ||
+-           (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
++           (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
+               int best_ndir = inodes_per_group;
+               int ret = -1;
+@@ -1044,7 +1044,7 @@ got:
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+               /* set extent flag only for directory, file and normal symlink*/
+               if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
+-                      EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
++                      ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+                       ext4_ext_tree_init(handle, inode);
+               }
+       }
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -957,7 +957,7 @@ static int ext4_ind_get_blocks(handle_t
+       int count = 0;
+       ext4_fsblk_t first_block = 0;
+-      J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
++      J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
+       J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
+       depth = ext4_block_to_path(inode, iblock, offsets,
+                                  &blocks_to_boundary);
+@@ -1085,7 +1085,7 @@ static int ext4_indirect_calc_metadata_a
+  */
+ static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
+ {
+-      if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
++      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               return ext4_ext_calc_metadata_amount(inode, lblock);
+       return ext4_indirect_calc_metadata_amount(inode, lblock);
+@@ -1274,7 +1274,7 @@ int ext4_get_blocks(handle_t *handle, st
+        * file system block.
+        */
+       down_read((&EXT4_I(inode)->i_data_sem));
+-      if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
++      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                               bh, 0);
+       } else {
+@@ -1336,7 +1336,7 @@ int ext4_get_blocks(handle_t *handle, st
+        * We need to check for EXT4 here because migrate
+        * could have changed the inode type in between
+        */
+-      if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
++      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                                             bh, flags);
+       } else {
+@@ -2371,7 +2371,7 @@ static void mpage_add_bh_to_extent(struc
+               goto flush_it;
+       /* check if thereserved journal credits might overflow */
+-      if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
++      if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
+               if (nrblocks >= EXT4_MAX_TRANS_DATA) {
+                       /*
+                        * With non-extent format we are limited by the journal
+@@ -2836,7 +2836,7 @@ static int ext4_da_writepages_trans_bloc
+        * number of contiguous block. So we will limit
+        * number of contiguous block to a sane value
+        */
+-      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
++      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
+           (max_blocks > EXT4_MAX_TRANS_DATA))
+               max_blocks = EXT4_MAX_TRANS_DATA;
+@@ -3872,7 +3872,7 @@ static ssize_t ext4_direct_IO(int rw, st
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+-      if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
++      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+       return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+@@ -4503,12 +4503,12 @@ void ext4_truncate(struct inode *inode)
+       if (!ext4_can_truncate(inode))
+               return;
+-      EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
++      ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+               ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+-      if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
++      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+               ext4_ext_truncate(inode);
+               return;
+       }
+@@ -5350,7 +5350,7 @@ int ext4_setattr(struct dentry *dentry,
+       }
+       if (attr->ia_valid & ATTR_SIZE) {
+-              if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
++              if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+                       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+                       if (attr->ia_size > sbi->s_bitmap_maxbytes) {
+@@ -5363,7 +5363,7 @@ int ext4_setattr(struct dentry *dentry,
+       if (S_ISREG(inode->i_mode) &&
+           attr->ia_valid & ATTR_SIZE &&
+           (attr->ia_size < inode->i_size ||
+-           (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
++           (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
+               handle_t *handle;
+               handle = ext4_journal_start(inode, 3);
+@@ -5395,7 +5395,7 @@ int ext4_setattr(struct dentry *dentry,
+                       }
+               }
+               /* ext4_truncate will clear the flag */
+-              if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
++              if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
+                       ext4_truncate(inode);
+       }
+@@ -5471,7 +5471,7 @@ static int ext4_indirect_trans_blocks(st
+ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+ {
+-      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+               return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
+       return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
+ }
+@@ -5806,9 +5806,9 @@ int ext4_change_inode_journal_flag(struc
+        */
+       if (val)
+-              EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
++              ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+       else
+-              EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
++              ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
+       ext4_set_aops(inode);
+       jbd2_journal_unlock_updates(journal);
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -2008,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_al
+       sbi = EXT4_SB(sb);
+       ngroups = ext4_get_groups_count(sb);
+       /* non-extent files are limited to low blocks/groups */
+-      if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
++      if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+               ngroups = sbi->s_blockfile_groups;
+       BUG_ON(ac->ac_status == AC_STATUS_FOUND);
+@@ -3176,7 +3176,7 @@ ext4_mb_use_preallocated(struct ext4_all
+                       continue;
+               /* non-extent files can't have physical blocks past 2^32 */
+-              if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
++              if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
+                       pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
+                       continue;
+--- a/fs/ext4/migrate.c
++++ b/fs/ext4/migrate.c
+@@ -465,7 +465,7 @@ int ext4_ext_migrate(struct inode *inode
+        */
+       if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+-          (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++          (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+               return -EINVAL;
+       if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -975,11 +975,11 @@ mext_check_arguments(struct inode *orig_
+       }
+       /* Ext4 move extent supports only extent based file */
+-      if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
++      if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
+               ext4_debug("ext4 move extent: orig file is not extents "
+                       "based file [ino:orig %lu]\n", orig_inode->i_ino);
+               return -EOPNOTSUPP;
+-      } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
++      } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
+               ext4_debug("ext4 move extent: donor file is not extents "
+                       "based file [ino:donor %lu]\n", donor_inode->i_ino);
+               return -EOPNOTSUPP;
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -660,7 +660,7 @@ int ext4_htree_fill_tree(struct file *di
+       dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+                      start_hash, start_minor_hash));
+       dir = dir_file->f_path.dentry->d_inode;
+-      if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
++      if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
+               hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
+               if (hinfo.hash_version <= DX_HASH_TEA)
+                       hinfo.hash_version +=
+@@ -805,7 +805,7 @@ static void ext4_update_dx_flag(struct i
+ {
+       if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+                                    EXT4_FEATURE_COMPAT_DIR_INDEX))
+-              EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
++              ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
+ /*
+@@ -1424,7 +1424,7 @@ static int make_indexed_dir(handle_t *ha
+               brelse(bh);
+               return retval;
+       }
+-      EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
++      ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
+       data1 = bh2->b_data;
+       memcpy (data1, de, len);
+@@ -1497,7 +1497,7 @@ static int ext4_add_entry(handle_t *hand
+               retval = ext4_dx_add_entry(handle, dentry, inode);
+               if (!retval || (retval != ERR_BAD_DX_DIR))
+                       return retval;
+-              EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
++              ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+               dx_fallback++;
+               ext4_mark_inode_dirty(handle, dir);
+       }
+@@ -2292,7 +2292,7 @@ retry:
+               }
+       } else {
+               /* clear the extent format for fast symlink */
+-              EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
++              ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+               inode->i_op = &ext4_fast_symlink_inode_operations;
+               memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
+               inode->i_size = l-1;
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -3999,6 +3999,7 @@ static int __init init_ext4_fs(void)
+ {
+       int err;
++      ext4_check_flag_values();
+       err = init_ext4_system_zone();
+       if (err)
+               return err;
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -816,7 +816,7 @@ inserted:
+                                               EXT4_I(inode)->i_block_group);
+                       /* non-extent files can't have physical blocks past 2^32 */
+-                      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++                      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+                               goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+                       block = ext4_new_meta_blocks(handle, inode,
+@@ -824,7 +824,7 @@ inserted:
+                       if (error)
+                               goto cleanup;
+-                      if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
++                      if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+                               BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
+                       ea_idebug(inode, "creating block %d", block);
diff --git a/queue-2.6.32/0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch b/queue-2.6.32/0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch
new file mode 100644 (file)
index 0000000..e9a1275
--- /dev/null
@@ -0,0 +1,51 @@
+From 570f16c4bfa97a7b2d3b3e6c0b8936ee91f32481 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:50:01 -0400
+Subject: ext4: Avoid crashing on NULL ptr dereference on a filesystem error
+
+commit f70f362b4a6fe47c239dbfb3efc0cc2c10e4f09c upstream (as of v2.6.34-git13)
+
+If the EOFBLOCKS_FL flag is set when it should not be and the inode is
+zero length, then eh_entries is zero, and ex is NULL, so dereferencing
+ex to print ex->ee_block causes a kernel OOPS in
+ext4_ext_map_blocks().
+
+On top of that, the error message which is printed isn't very helpful.
+So we fix this by printing something more explanatory which doesn't
+involve trying to print ex->ee_block.
+
+Addresses-Google-Bug: #2655740
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3281,8 +3281,8 @@ int ext4_ext_get_blocks(handle_t *handle
+        */
+       if (path[depth].p_ext == NULL && depth != 0) {
+               ext4_error(inode->i_sb, __func__, "bad extent address "
+-                         "inode: %lu, iblock: %d, depth: %d",
+-                         inode->i_ino, iblock, depth);
++                         "inode: %lu, iblock: %lu, depth: %d",
++                         inode->i_ino, (unsigned long) iblock, depth);
+               err = -EIO;
+               goto out2;
+       }
+@@ -3418,8 +3418,11 @@ int ext4_ext_get_blocks(handle_t *handle
+               } else {
+                       WARN_ON(eh->eh_entries == 0);
+                       ext4_error(inode->i_sb, __func__,
+-                              "inode#%lu, eh->eh_entries = 0!", inode->i_ino);
+-                      }
++                                 "inode#%lu, eh->eh_entries = 0 and "
++                                 "EOFBLOCKS_FL set", inode->i_ino);
++                      err = -EIO;
++                      goto out2;
++              }
+       }
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+       if (err) {
diff --git a/queue-2.6.32/0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch b/queue-2.6.32/0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch
new file mode 100644 (file)
index 0000000..d5e1bac
--- /dev/null
@@ -0,0 +1,78 @@
+From 3b2905c2bc46795b9c8e54ddc435bd78f4391972 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sun, 30 May 2010 22:50:02 -0400
+Subject: ext4: Clear the EXT4_EOFBLOCKS_FL flag only when warranted
+
+commit 786ec7915e530936b9eb2e3d12274145cab7aa7d upstream (as of v2.6.34-git13)
+
+Dmitry Monakhov discovered an edge case where it was possible for the
+EXT4_EOFBLOCKS_FL flag to get cleared unnecessarily.  This is true;
+I have a test case that can be exercised via downloading and
+decompressing the file:
+
+wget ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/ext4-testcases/eofblocks-fl-test-case.img.bz2
+bunzip2 eofblocks-fl-test-case.img
+dd if=/dev/zero of=eofblocks-fl-test-case.img bs=1k seek=17925 bs=1k count=1 conv=notrunc
+
+However, triggering it in real life is highly unlikely since it
+requires an extremely fragmented sparse file with a hole in exactly
+the right place in the extent tree.  (It actually took quite a bit of
+work to generate this test case.)  Still, it's nice to get even
+extreme corner cases to be correct, so this patch makes sure that we
+don't clear the EXT4_EOFBLOCKS_FL incorrectly even in this corner
+case.
+
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |   26 ++++++++++++++++++--------
+ 1 file changed, 18 insertions(+), 8 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3229,7 +3229,7 @@ int ext4_ext_get_blocks(handle_t *handle
+       struct ext4_extent_header *eh;
+       struct ext4_extent newex, *ex, *last_ex;
+       ext4_fsblk_t newblock;
+-      int err = 0, depth, ret, cache_type;
++      int i, err = 0, depth, ret, cache_type;
+       unsigned int allocated = 0;
+       struct ext4_allocation_request ar;
+       ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
+@@ -3410,19 +3410,29 @@ int ext4_ext_get_blocks(handle_t *handle
+       }
+       if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
+-              if (eh->eh_entries) {
+-                      last_ex = EXT_LAST_EXTENT(eh);
+-                      if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+-                                          + ext4_ext_get_actual_len(last_ex))
+-                              ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+-              } else {
+-                      WARN_ON(eh->eh_entries == 0);
++              if (unlikely(!eh->eh_entries)) {
+                       ext4_error(inode->i_sb, __func__,
+                                  "inode#%lu, eh->eh_entries = 0 and "
+                                  "EOFBLOCKS_FL set", inode->i_ino);
+                       err = -EIO;
+                       goto out2;
+               }
++              last_ex = EXT_LAST_EXTENT(eh);
++              /*
++               * If the current leaf block was reached by looking at
++               * the last index block all the way down the tree, and
++               * we are extending the inode beyond the last extent
++               * in the current leaf block, then clear the
++               * EOFBLOCKS_FL flag.
++               */
++              for (i = depth-1; i >= 0; i--) {
++                      if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
++                              break;
++              }
++              if ((i < 0) &&
++                  (iblock + ar.len > le32_to_cpu(last_ex->ee_block) +
++                   ext4_ext_get_actual_len(last_ex)))
++                      ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
+       }
+       err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+       if (err) {
diff --git a/queue-2.6.32/0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch b/queue-2.6.32/0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch
new file mode 100644 (file)
index 0000000..9f90304
--- /dev/null
@@ -0,0 +1,83 @@
+From b3143b86111dcac45717136a6d776f993aace17f Mon Sep 17 00:00:00 2001
+From: Dmitry Monakhov <dmonakhov@openvz.org>
+Date: Sun, 30 May 2010 22:50:03 -0400
+Subject: ext4: restart ext4_ext_remove_space() after transaction restart
+
+commit 0617b83fa239db9743a18ce6cc0e556f4d0fd567 upstream (as of v2.6.34-git13)
+
+If i_data_sem was internally dropped due to transaction restart, it is
+necessary to restart path look-up because the extents tree was possibly
+modified by ext4_get_block().
+
+https://bugzilla.kernel.org/show_bug.cgi?id=15827
+
+Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Acked-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/extents.c |   16 +++++++++-------
+ 1 file changed, 9 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_rest
+       if (err <= 0)
+               return err;
+       err = ext4_truncate_restart_trans(handle, inode, needed);
+-      /*
+-       * We have dropped i_data_sem so someone might have cached again
+-       * an extent we are going to truncate.
+-       */
+-      ext4_ext_invalidate_cache(inode);
++      if (err == 0)
++              err = -EAGAIN;
+       return err;
+ }
+@@ -2263,7 +2260,7 @@ static int ext4_ext_remove_space(struct
+       int depth = ext_depth(inode);
+       struct ext4_ext_path *path;
+       handle_t *handle;
+-      int i = 0, err = 0;
++      int i, err;
+       ext_debug("truncate since %u\n", start);
+@@ -2272,23 +2269,26 @@ static int ext4_ext_remove_space(struct
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
++again:
+       ext4_ext_invalidate_cache(inode);
+       /*
+        * We start scanning from right side, freeing all the blocks
+        * after i_size and walking into the tree depth-wise.
+        */
++      depth = ext_depth(inode);
+       path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
+       if (path == NULL) {
+               ext4_journal_stop(handle);
+               return -ENOMEM;
+       }
++      path[0].p_depth = depth;
+       path[0].p_hdr = ext_inode_hdr(inode);
+       if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
+               err = -EIO;
+               goto out;
+       }
+-      path[0].p_depth = depth;
++      i = err = 0;
+       while (i >= 0 && err == 0) {
+               if (i == depth) {
+@@ -2382,6 +2382,8 @@ static int ext4_ext_remove_space(struct
+ out:
+       ext4_ext_drop_refs(path);
+       kfree(path);
++      if (err == -EAGAIN)
++              goto again;
+       ext4_journal_stop(handle);
+       return err;
diff --git a/queue-2.6.32/0050-ext4-Conditionally-define-compat-ioctl-numbers.patch b/queue-2.6.32/0050-ext4-Conditionally-define-compat-ioctl-numbers.patch
new file mode 100644 (file)
index 0000000..1adce6b
--- /dev/null
@@ -0,0 +1,36 @@
+From e58debc557cca3fa1ce0f893978be42dfa489699 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 30 May 2010 22:50:04 -0400
+Subject: ext4: Conditionally define compat ioctl numbers
+
+commit 899ad0cea6ad7ff4ba24b16318edbc3cbbe03fad upstream (as of v2.6.34-git13)
+
+It is unnecessary, and in general impossible, to define the compat
+ioctl numbers except when building the filesystem with CONFIG_COMPAT
+defined.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -462,6 +462,7 @@ struct ext4_new_group_data {
+ #define EXT4_IOC_ALLOC_DA_BLKS                _IO('f', 12)
+ #define EXT4_IOC_MOVE_EXT             _IOWR('f', 15, struct move_extent)
++#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+ /*
+  * ioctl commands in 32 bit emulation
+  */
+@@ -477,6 +478,7 @@ struct ext4_new_group_data {
+ #endif
+ #define EXT4_IOC32_GETVERSION_OLD     FS_IOC32_GETVERSION
+ #define EXT4_IOC32_SETVERSION_OLD     FS_IOC32_SETVERSION
++#endif
+ /*
diff --git a/queue-2.6.32/0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch b/queue-2.6.32/0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch
new file mode 100644 (file)
index 0000000..420ca0c
--- /dev/null
@@ -0,0 +1,91 @@
+From a496748686cdccd4b5bf1b5696919e380dc48da0 Mon Sep 17 00:00:00 2001
+From: Ben Hutchings <ben@decadent.org.uk>
+Date: Sun, 30 May 2010 22:50:05 -0400
+Subject: ext4: Fix compat EXT4_IOC_ADD_GROUP
+
+commit 4d92dc0f00a775dc2e1267b0e00befb783902fe7 upstream (as of v2.6.34-git13)
+
+struct ext4_new_group_input needs to be converted because u64 has
+only 32-bit alignment on some 32-bit architectures, notably i386.
+
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h  |   16 ++++++++++++++++
+ fs/ext4/ioctl.c |   25 +++++++++++++++++++++++--
+ 2 files changed, 39 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -29,6 +29,9 @@
+ #include <linux/wait.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
++#ifdef __KERNEL__
++#include <linux/compat.h>
++#endif
+ /*
+  * The fourth extended filesystem constants/structures
+@@ -403,6 +406,18 @@ struct ext4_new_group_input {
+       __u16 unused;
+ };
++#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
++struct compat_ext4_new_group_input {
++      u32 group;
++      compat_u64 block_bitmap;
++      compat_u64 inode_bitmap;
++      compat_u64 inode_table;
++      u32 blocks_count;
++      u16 reserved_blocks;
++      u16 unused;
++};
++#endif
++
+ /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
+ struct ext4_new_group_data {
+       __u32 group;
+@@ -473,6 +488,7 @@ struct ext4_new_group_data {
+ #define EXT4_IOC32_GETRSVSZ           _IOR('f', 5, int)
+ #define EXT4_IOC32_SETRSVSZ           _IOW('f', 6, int)
+ #define EXT4_IOC32_GROUP_EXTEND               _IOW('f', 7, unsigned int)
++#define EXT4_IOC32_GROUP_ADD          _IOW('f', 8, struct compat_ext4_new_group_input)
+ #ifdef CONFIG_JBD2_DEBUG
+ #define EXT4_IOC32_WAIT_FOR_READONLY  _IOR('f', 99, int)
+ #endif
+--- a/fs/ext4/ioctl.c
++++ b/fs/ext4/ioctl.c
+@@ -373,8 +373,29 @@ long ext4_compat_ioctl(struct file *file
+       case EXT4_IOC32_SETRSVSZ:
+               cmd = EXT4_IOC_SETRSVSZ;
+               break;
+-      case EXT4_IOC_GROUP_ADD:
+-              break;
++      case EXT4_IOC32_GROUP_ADD: {
++              struct compat_ext4_new_group_input __user *uinput;
++              struct ext4_new_group_input input;
++              mm_segment_t old_fs;
++              int err;
++
++              uinput = compat_ptr(arg);
++              err = get_user(input.group, &uinput->group);
++              err |= get_user(input.block_bitmap, &uinput->block_bitmap);
++              err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
++              err |= get_user(input.inode_table, &uinput->inode_table);
++              err |= get_user(input.blocks_count, &uinput->blocks_count);
++              err |= get_user(input.reserved_blocks,
++                              &uinput->reserved_blocks);
++              if (err)
++                      return -EFAULT;
++              old_fs = get_fs();
++              set_fs(KERNEL_DS);
++              err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
++                               (unsigned long) &input);
++              set_fs(old_fs);
++              return err;
++      }
+       case EXT4_IOC_MOVE_EXT:
+               break;
+       default:
diff --git a/queue-2.6.32/0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch b/queue-2.6.32/0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch
new file mode 100644 (file)
index 0000000..ed8bb2e
--- /dev/null
@@ -0,0 +1,92 @@
+From 2959737e6c8ee73e85bf706f11b272bab323597f Mon Sep 17 00:00:00 2001
+From: Frank Mayhar <fmayhar@google.com>
+Date: Sun, 30 May 2010 22:50:06 -0400
+Subject: ext4: Make fsync sync new parent directories in no-journal mode
+
+commit 14ece1028b3ed53ffec1b1213ffc6acaf79ad77c upstream (as of v2.6.34-git13)
+
+Add a new ext4 state to tell us when a file has been newly created; use
+that state in ext4_sync_file in no-journal mode to tell us when we need
+to sync the parent directory as well as the inode and data itself.  This
+fixes a problem in which a panic or power failure may lose the entire
+file even when using fsync, since the parent directory entry is lost.
+
+Addresses-Google-Bug: #2480057
+
+Signed-off-by: Frank Mayhar <fmayhar@google.com>
+Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+---
+ fs/ext4/ext4.h  |    1 +
+ fs/ext4/fsync.c |   31 +++++++++++++++++++++++++++++--
+ fs/ext4/namei.c |    2 ++
+ 3 files changed, 32 insertions(+), 2 deletions(-)
+
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1142,6 +1142,7 @@ enum {
+       EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
+       EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
+       EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
++      EXT4_STATE_NEWENTRY,            /* File just added to dir */
+ };
+ #define EXT4_INODE_BIT_FNS(name, field)                                       \
+--- a/fs/ext4/fsync.c
++++ b/fs/ext4/fsync.c
+@@ -35,6 +35,29 @@
+ #include <trace/events/ext4.h>
+ /*
++ * If we're not journaling and this is a just-created file, we have to
++ * sync our parent directory (if it was freshly created) since
++ * otherwise it will only be written by writeback, leaving a huge
++ * window during which a crash may lose the file.  This may apply for
++ * the parent directory's parent as well, and so on recursively, if
++ * they are also freshly created.
++ */
++static void ext4_sync_parent(struct inode *inode)
++{
++      struct dentry *dentry = NULL;
++
++      while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
++              ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
++              dentry = list_entry(inode->i_dentry.next,
++                                  struct dentry, d_alias);
++              if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
++                      break;
++              inode = dentry->d_parent->d_inode;
++              sync_mapping_buffers(inode->i_mapping);
++      }
++}
++
++/*
+  * akpm: A new design for ext4_sync_file().
+  *
+  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+@@ -67,8 +90,12 @@ int ext4_sync_file(struct file *file, st
+       if (ret < 0)
+               return ret;
+-      if (!journal)
+-              return simple_fsync(file, dentry, datasync);
++      if (!journal) {
++              ret = simple_fsync(file, dentry, datasync);
++              if (!ret && !list_empty(&inode->i_dentry))
++                      ext4_sync_parent(inode);
++              return ret;
++      }
+       /*
+        * data=writeback,ordered:
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -1525,6 +1525,8 @@ static int ext4_add_entry(handle_t *hand
+       de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+       retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
+       brelse(bh);
++      if (retval == 0)
++              ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+       return retval;
+ }
index af745db618078ff2ae04b69f13513e26b1c6d8d6..100bdec8442b1d0630ba9360984fa9fe46fb5b61 100644 (file)
@@ -78,3 +78,59 @@ usb-sisusbvga-fix-for-usb-3.0.patch
 usb-add-quirk-for-broadcom-bt-dongle.patch
 usb-ftdi-add-support-for-the-rt-system-vx-7-radio-programming-cable.patch
 ethtool-fix-potential-user-buffer-overflow-for-ethtool_-g-s-rxfh.patch
+0001-ext4-Fix-potential-quota-deadlock.patch
+0002-ext4-replace-BUG-with-return-EIO-in-ext4_ext_get_blo.patch
+0003-ext4-jbd2-Add-barriers-for-file-systems-with-exernal.patch
+0004-ext4-Eliminate-potential-double-free-on-error-path.patch
+0005-ext4-return-correct-wbc.nr_to_write-in-ext4_da_write.patch
+0006-ext4-Ensure-zeroout-blocks-have-no-dirty-metadata.patch
+0007-ext4-Patch-up-how-we-claim-metadata-blocks-for-quota.patch
+0008-ext4-Fix-accounting-of-reserved-metadata-blocks.patch
+0009-ext4-Calculate-metadata-requirements-more-accurately.patch
+0010-ext4-Handle-EDQUOT-error-on-write.patch
+0011-ext4-Fix-quota-accounting-error-with-fallocate.patch
+0012-ext4-Drop-EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE-flag.patch
+0013-ext4-Use-bitops-to-read-modify-EXT4_I-inode-i_state.patch
+0014-ext4-Fix-BUG_ON-at-fs-buffer.c-652-in-no-journal-mod.patch
+0015-ext4-Add-flag-to-files-with-blocks-intentionally-pas.patch
+0016-ext4-Fix-fencepost-error-in-chosing-choosing-group-v.patch
+0017-ext4-fix-error-handling-in-migrate.patch
+0018-ext4-explicitly-remove-inode-from-orphan-list-after-.patch
+0019-ext4-Handle-non-empty-on-disk-orphan-link.patch
+0020-ext4-make-offset-consistent-in-ext4_check_dir_entry.patch
+0021-ext4-Fix-insertion-point-of-extent-in-mext_insert_ac.patch
+0022-ext4-Fix-the-NULL-reference-in-double_down_write_dat.patch
+0023-ext4-Code-cleanup-for-EXT4_IOC_MOVE_EXT-ioctl.patch
+0024-ext4-Fix-estimate-of-of-blocks-needed-to-write-indir.patch
+0025-ext4-Fixed-inode-allocator-to-correctly-track-a-flex.patch
+0026-ext4-Fix-possible-lost-inode-write-in-no-journal-mod.patch
+0027-ext4-Fix-buffer-head-leaks-after-calls-to-ext4_get_i.patch
+0028-ext4-Issue-the-discard-operation-before-releasing-th.patch
+0029-ext4-check-missed-return-value-in-ext4_sync_file.patch
+0030-ext4-fix-memory-leaks-in-error-path-handling-of-ext4.patch
+0031-ext4-Remove-unnecessary-call-to-ext4_get_group_desc-.patch
+0032-ext4-rename-ext4_mb_release_desc-to-ext4_mb_unload_b.patch
+0033-ext4-allow-defrag-EXT4_IOC_MOVE_EXT-in-32bit-compat-.patch
+0034-ext4-fix-quota-accounting-in-case-of-fallocate.patch
+0035-ext4-check-s_log_groups_per_flex-in-online-resize-co.patch
+0036-ext4-don-t-return-to-userspace-after-freezing-the-fs.patch
+0037-ext4-stop-issuing-discards-if-not-supported-by-devic.patch
+0038-ext4-don-t-scan-accumulate-more-pages-than-mballoc-w.patch
+0039-ext4-Do-not-zero-out-uninitialized-extents-beyond-i_.patch
+0040-ext4-clean-up-inode-bitmaps-manipulation-in-ext4_fre.patch
+0041-ext4-init-statistics-after-journal-recovery.patch
+0042-ext4-Remove-extraneous-newlines-in-ext4_msg-calls.patch
+0043-ext4-Prevent-creation-of-files-larger-than-RLIMIT_FS.patch
+0044-ext4-check-for-a-good-block-group-before-loading-bud.patch
+0045-ext4-Show-journal_checksum-option.patch
+0046-ext4-Use-bitops-to-read-modify-i_flags-in-struct-ext.patch
+0047-ext4-Avoid-crashing-on-NULL-ptr-dereference-on-a-fil.patch
+0048-ext4-Clear-the-EXT4_EOFBLOCKS_FL-flag-only-when-warr.patch
+0049-ext4-restart-ext4_ext_remove_space-after-transaction.patch
+0050-ext4-Conditionally-define-compat-ioctl-numbers.patch
+0051-ext4-Fix-compat-EXT4_IOC_ADD_GROUP.patch
+0052-ext4-Make-fsync-sync-new-parent-directories-in-no-jo.patch
+0001-KVM-MMU-Remove-user-access-when-allowing-kernel-acce.patch
+0002-KVM-SVM-Handle-MCEs-early-in-the-vmexit-process.patch
+0003-KVM-SVM-Implement-workaround-for-Erratum-383.patch
+0004-KVM-MMU-invalidate-and-flush-on-spte-small-large-pag.patch