From: Greg Kroah-Hartman Date: Thu, 22 Jan 2009 23:31:44 +0000 (-0800) Subject: .27 patches X-Git-Tag: v2.6.27.13~4 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=05ec1fc4dac4ff4bf0ca0690b3f405a4e96bfd80;p=thirdparty%2Fkernel%2Fstable-queue.git .27 patches --- diff --git a/queue-2.6.27/fs-remove-wb_sync_hold.patch b/queue-2.6.27/fs-remove-wb_sync_hold.patch new file mode 100644 index 00000000000..11cb83b3c91 --- /dev/null +++ b/queue-2.6.27/fs-remove-wb_sync_hold.patch @@ -0,0 +1,91 @@ +From 4f5a99d64c17470a784a6c68064207d82e3e74a5 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:25 -0800 +Subject: fs: remove WB_SYNC_HOLD + +From: Nick Piggin + +commit 4f5a99d64c17470a784a6c68064207d82e3e74a5 upstream. + +Remove WB_SYNC_HOLD. The primary motivation is the design of my +anti-starvation code for fsync. It requires taking an inode lock over the +sync operation, so we could run into lock ordering problems with multiple +inodes. It is possible to take a single global lock to solve the ordering +problem, but then that would prevent a future nice implementation of "sync +multiple inodes" based on lock order via inode address. + +Seems like a backward step to remove this, but actually it is busted +anyway: we can't use the inode lists for data integrity wait: an inode can +be taken off the dirty lists but still be under writeback. In order to +satisfy data integrity semantics, we should wait for it to finish +writeback, but if we only search the dirty lists, we'll miss it. + +It would be possible to have a "writeback" list, for sys_sync, I suppose. +But why complicate things by optimising prematurely? For unmounting, we +could avoid the "livelock avoidance" code, which would be easier, but +again premature IMO. + +Fixing the existing data integrity problem will come next. + +Signed-off-by: Nick Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 12 ++---------- + include/linux/writeback.h | 1 - + 2 files changed, 2 insertions(+), 11 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *i + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. + * +- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so +- * that it can be located for waiting on in __writeback_single_inode(). +- * + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, +@@ -499,10 +496,6 @@ void generic_sync_sb_inodes(struct super + __iget(inode); + pages_skipped = wbc->pages_skipped; + __writeback_single_inode(inode, wbc); +- if (wbc->sync_mode == WB_SYNC_HOLD) { +- inode->dirtied_when = jiffies; +- list_move(&inode->i_list, &sb->s_dirty); +- } + if (current_is_pdflush()) + writeback_release(bdi); + if (wbc->pages_skipped != pages_skipped) { +@@ -588,8 +581,7 @@ restart: + + /* + * writeback and wait upon the filesystem's dirty inodes. The caller will +- * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is +- * used to park the written inodes on sb->s_dirty for the wait pass. ++ * do this in two passes - one to write, and one to wait. + * + * A finite limit is set on the number of pages which will be written. + * To prevent infinite livelock of sys_sync().
+@@ -600,7 +592,7 @@ restart: + void sync_inodes_sb(struct super_block *sb, int wait) + { + struct writeback_control wbc = { +- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, ++ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -30,7 +30,6 @@ static inline int task_is_pdflush(struct + enum writeback_sync_modes { + WB_SYNC_NONE, /* Don't wait on anything */ + WB_SYNC_ALL, /* Wait on every mapping */ +- WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */ + }; + + /* diff --git a/queue-2.6.27/fs-sync_sb_inodes-fix.patch b/queue-2.6.27/fs-sync_sb_inodes-fix.patch new file mode 100644 index 00000000000..f5a003ec40a --- /dev/null +++ b/queue-2.6.27/fs-sync_sb_inodes-fix.patch @@ -0,0 +1,106 @@ +From 38f21977663126fef53f5585e7f1653d8ebe55c4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:25 -0800 +Subject: fs: sync_sb_inodes fix + +From: Nick Piggin + +commit 38f21977663126fef53f5585e7f1653d8ebe55c4 upstream. + +Fix data integrity semantics required by sys_sync, by iterating over all +inodes and waiting for any writeback pages after the initial writeout. +Comments explain the exact problem. + +Signed-off-by: Nick Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 53 insertions(+), 7 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -440,6 +440,7 @@ void generic_sync_sb_inodes(struct super + struct writeback_control *wbc) + { + const unsigned long start = jiffies; /* livelock avoidance */ ++ int sync = wbc->sync_mode == WB_SYNC_ALL; + + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&sb->s_io)) +@@ -516,7 +517,49 @@ void generic_sync_sb_inodes(struct super + if (!list_empty(&sb->s_more_io)) + wbc->more_io = 1; + } +- spin_unlock(&inode_lock); ++ ++ if (sync) { ++ struct inode *inode, *old_inode = NULL; ++ ++ /* ++ * Data integrity sync. Must wait for all pages under writeback, ++ * because there may have been pages dirtied before our sync ++ * call, but which had writeout started before we write it out. ++ * In which case, the inode may not be on the dirty list, but ++ * we still have to wait for that writeout. ++ */ ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ struct address_space *mapping; ++ ++ if (inode->i_state & (I_FREEING|I_WILL_FREE)) ++ continue; ++ mapping = inode->i_mapping; ++ if (mapping->nrpages == 0) ++ continue; ++ __iget(inode); ++ spin_unlock(&inode_lock); ++ /* ++ * We hold a reference to 'inode' so it couldn't have ++ * been removed from s_inodes list while we dropped the ++ * inode_lock. We cannot iput the inode now as we can ++ * be holding the last reference and we cannot iput it ++ * under inode_lock. So we keep the reference and iput ++ * it later. 
++ */ ++ iput(old_inode); ++ old_inode = inode; ++ ++ filemap_fdatawait(mapping); ++ ++ cond_resched(); ++ ++ spin_lock(&inode_lock); ++ } ++ spin_unlock(&inode_lock); ++ iput(old_inode); ++ } else ++ spin_unlock(&inode_lock); ++ + return; /* Leave any unwritten inodes on s_io */ + } + EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); +@@ -596,13 +639,16 @@ void sync_inodes_sb(struct super_block * + .range_start = 0, + .range_end = LLONG_MAX, + }; +- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); +- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + +- wbc.nr_to_write = nr_dirty + nr_unstable + +- (inodes_stat.nr_inodes - inodes_stat.nr_unused) + +- nr_dirty + nr_unstable; +- wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ ++ if (!wait) { ++ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); ++ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); ++ ++ wbc.nr_to_write = nr_dirty + nr_unstable + ++ (inodes_stat.nr_inodes - inodes_stat.nr_unused); ++ } else ++ wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ ++ + sync_sb_inodes(sb, &wbc); + } + diff --git a/queue-2.6.27/fs-sys_sync-fix.patch b/queue-2.6.27/fs-sys_sync-fix.patch new file mode 100644 index 00000000000..c285093a8fc --- /dev/null +++ b/queue-2.6.27/fs-sys_sync-fix.patch @@ -0,0 +1,83 @@ +From 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:26 -0800 +Subject: fs: sys_sync fix + +From: Nick Piggin + +commit 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b upstream. + +s_syncing livelock avoidance was breaking data integrity guarantee of +sys_sync, by allowing sys_sync to skip writing or waiting for superblocks +if there is a concurrent sys_sync happening. + +This livelock avoidance is much less important now that we don't have the +get_super_to_sync() call after every sb that we sync. This was replaced +by __put_super_and_need_restart. + +Signed-off-by: Nick Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 20 +------------------- + include/linux/fs.h | 1 - + 2 files changed, 1 insertion(+), 20 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -652,18 +652,6 @@ void sync_inodes_sb(struct super_block * + sync_sb_inodes(sb, &wbc); + } + +-/* +- * Rather lame livelock avoidance. 
+- */ +-static void set_sb_syncing(int val) +-{ +- struct super_block *sb; +- spin_lock(&sb_lock); +- list_for_each_entry_reverse(sb, &super_blocks, s_list) +- sb->s_syncing = val; +- spin_unlock(&sb_lock); +-} +- + /** + * sync_inodes - writes all inodes to disk + * @wait: wait for completion +@@ -690,9 +678,6 @@ static void __sync_inodes(int wait) + spin_lock(&sb_lock); + restart: + list_for_each_entry(sb, &super_blocks, s_list) { +- if (sb->s_syncing) +- continue; +- sb->s_syncing = 1; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); +@@ -710,13 +695,10 @@ restart: + + void sync_inodes(int wait) + { +- set_sb_syncing(0); + __sync_inodes(0); + +- if (wait) { +- set_sb_syncing(0); ++ if (wait) + __sync_inodes(1); +- } + } + + /** +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1080,7 +1080,6 @@ struct super_block { + struct rw_semaphore s_umount; + struct mutex s_lock; + int s_count; +- int s_syncing; + int s_need_sync_fs; + atomic_t s_active; + #ifdef CONFIG_SECURITY diff --git a/queue-2.6.27/mm-direct-io-starvation-improvement.patch b/queue-2.6.27/mm-direct-io-starvation-improvement.patch new file mode 100644 index 00000000000..1872a649e7d --- /dev/null +++ b/queue-2.6.27/mm-direct-io-starvation-improvement.patch @@ -0,0 +1,73 @@ +From 48b47c561e41525061b5bc0cfd67d6367fd11dc4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:22 -0800 +Subject: mm: direct IO starvation improvement + +From: Nick Piggin + +commit 48b47c561e41525061b5bc0cfd67d6367fd11dc4 upstream. + +Direct IO can invalidate and sync a lot of pagecache pages in the mapping. + A 4K direct IO will actually try to sync and/or invalidate the pagecache +of the entire file, for example (which might be many GB or TB large). + +Improve this by doing range syncs. Also, memory no longer has to be +unmapped to catch the dirty bits for syncing, as dirty bits would remain +coherent due to dirty mmap accounting. + +This fixes the immediate DM deadlocks when doing direct IO reads to block +device with a mounted filesystem, if only by papering over the problem +somewhat rather than addressing the fsync starvation cases. + +Signed-off-by: Nick Piggin +Reviewed-by: Jeff Moyer +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -1304,7 +1304,8 @@ generic_file_aio_read(struct kiocb *iocb + goto out; /* skip atime */ + size = i_size_read(inode); + if (pos < size) { +- retval = filemap_write_and_wait(mapping); ++ retval = filemap_write_and_wait_range(mapping, pos, ++ pos + iov_length(iov, nr_segs) - 1); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); +@@ -2117,18 +2118,10 @@ generic_file_direct_write(struct kiocb * + if (count != ocount) + *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + +- /* +- * Unmap all mmappings of the file up-front. +- * +- * This will cause any pte dirty bits to be propagated into the +- * pageframes for the subsequent filemap_write_and_wait(). 
+- */ + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; +- if (mapping_mapped(mapping)) +- unmap_mapping_range(mapping, pos, write_len, 0); + +- written = filemap_write_and_wait(mapping); ++ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + +@@ -2519,7 +2512,8 @@ generic_file_buffered_write(struct kiocb + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) +- status = filemap_write_and_wait(mapping); ++ status = filemap_write_and_wait_range(mapping, ++ pos, pos + written - 1); + + return written ? written : status; + } diff --git a/queue-2.6.27/mm-do_sync_mapping_range-integrity-fix.patch b/queue-2.6.27/mm-do_sync_mapping_range-integrity-fix.patch new file mode 100644 index 00000000000..8a36b0a64c8 --- /dev/null +++ b/queue-2.6.27/mm-do_sync_mapping_range-integrity-fix.patch @@ -0,0 +1,37 @@ +From ee53a891f47444c53318b98dac947ede963db400 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:12 -0800 +Subject: mm: do_sync_mapping_range integrity fix + +From: Nick Piggin + +commit ee53a891f47444c53318b98dac947ede963db400 upstream. + +Chris Mason notices do_sync_mapping_range didn't actually ask for data +integrity writeout. Unfortunately, it is advertised as being usable for +data integrity operations. + +This is a data integrity bug. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/sync.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/sync.c ++++ b/fs/sync.c +@@ -287,7 +287,7 @@ int do_sync_mapping_range(struct address + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, +- WB_SYNC_NONE); ++ WB_SYNC_ALL); + if (ret < 0) + goto out; + } diff --git a/queue-2.6.27/mm-write_cache_pages-cleanups.patch b/queue-2.6.27/mm-write_cache_pages-cleanups.patch new file mode 100644 index 00000000000..f41ed9589af --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-cleanups.patch @@ -0,0 +1,94 @@ +From 5a3d5c9813db56a75934eb1015367fda23a8b0b4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:09 -0800 +Subject: mm: write_cache_pages cleanups + +From: Nick Piggin + +commit 5a3d5c9813db56a75934eb1015367fda23a8b0b4 upstream. + +Get rid of some complex expressions from flow control statements, add a +comment, remove some duplicate code. 
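To make the restructuring concrete, here is a minimal, self-contained userspace C model of the same control flow; lookup_batch(), page_dirty(), lock_page() and unlock_page() are illustrative stand-ins for the kernel's pagevec and page-lock machinery, not real interfaces. It shows the two changes this patch describes: the batch lookup hoisted out of the compound while condition, and the duplicated unlock-then-continue exits folded into a single continue_unlock label.

#include <stdio.h>

#define BATCH 4

/* Stand-in for pagevec_lookup_tag(): collect up to BATCH "dirty" page
 * indices, advancing the cursor as the real lookup does. */
static int lookup_batch(int *batch, int *index, int end)
{
	int n = 0;

	while (n < BATCH && *index <= end)
		batch[n++] = (*index)++;
	return n;
}

static int page_dirty(int page)   { return page % 3 != 0; }
static void lock_page(int page)   { (void)page; }
static void unlock_page(int page) { (void)page; }

int main(void)
{
	int batch[BATCH];
	int index = 0, end = 10;

	while (index <= end) {
		int i, nr_pages = lookup_batch(batch, &index, end);

		if (nr_pages == 0)
			break;	/* replaces the compound while condition */

		for (i = 0; i < nr_pages; i++) {
			int page = batch[i];

			lock_page(page);
			if (!page_dirty(page))
				goto continue_unlock;

			printf("writing page %d\n", page);
			unlock_page(page);
			continue;
continue_unlock:
			unlock_page(page);	/* one shared skip path */
		}
	}
	return 0;
}

A goto to a shared exit label is the idiomatic kernel pattern when several checks need identical cleanup; it is what lets the later patches in this series add more skip conditions without duplicating the unlock again.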
+ +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 34 ++++++++++++++++++++++------------ + 1 file changed, 22 insertions(+), 12 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -902,11 +902,14 @@ int write_cache_pages(struct address_spa + } + retry: + done_index = index; +- while (!done && (index <= end) && +- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, +- PAGECACHE_TAG_DIRTY, +- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +- unsigned i; ++ while (!done && (index <= end)) { ++ int i; ++ ++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, ++ PAGECACHE_TAG_DIRTY, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); ++ if (nr_pages == 0) ++ break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; +@@ -922,7 +925,16 @@ retry: + */ + lock_page(page); + ++ /* ++ * Page truncated or invalidated. We can freely skip it ++ * then, even for data integrity operations: the page ++ * has disappeared concurrently, so there could be no ++ * real expectation of this data integrity operation ++ * even if there is now a new, dirty page at the same ++ * pagecache address. ++ */ + if (unlikely(page->mapping != mapping)) { ++continue_unlock: + unlock_page(page); + continue; + } +@@ -933,18 +945,15 @@ retry: + * end == -1 in that case. + */ + done = 1; +- unlock_page(page); +- continue; ++ goto continue_unlock; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || +- !clear_page_dirty_for_io(page)) { +- unlock_page(page); +- continue; +- } ++ !clear_page_dirty_for_io(page)) ++ goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + +@@ -968,7 +977,8 @@ retry: + } + + if (wbc->sync_mode == WB_SYNC_NONE) { +- if (--wbc->nr_to_write <= 0) ++ wbc->nr_to_write--; ++ if (wbc->nr_to_write <= 0) + done = 1; + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { diff --git a/queue-2.6.27/mm-write_cache_pages-early-loop-termination.patch b/queue-2.6.27/mm-write_cache_pages-early-loop-termination.patch new file mode 100644 index 00000000000..4be92787246 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-early-loop-termination.patch @@ -0,0 +1,70 @@ +From bd19e012f6fd3b7309689165ea865cbb7bb88c1e Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:06 -0800 +Subject: mm: write_cache_pages early loop termination + +From: Nick Piggin + +commit bd19e012f6fd3b7309689165ea865cbb7bb88c1e upstream. + +We'd like to break out of the loop early in many situations; however, the +existing code has been setting mapping->writeback_index past the final +page in the pagevec lookup for cyclic writeback. This is a problem if we +don't process all pages up to the final page. + +Currently the code mostly keeps writeback_index reasonable, and hacks +around this by not breaking out of the loop or writing pages outside the +range in these cases. Keep track of a real "done index" that enables us +to terminate the loop in a much more flexible manner. + +Needed by the subsequent patch to preserve writepage errors, and then +further patches to break out of the loop early for other reasons. However +there are no functional changes with this patch alone.
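As a concrete illustration of the "done index" idea, the sketch below is a simplified, self-contained userspace model, not kernel code; pages[], nr_to_write and done_index merely mimic the roles of the pagevec contents, wbc->nr_to_write and the new variable. It shows why the resume point must be the first unprocessed page rather than the lookup cursor: after an early break, the cursor has already advanced past pages that were looked up but never written.

#include <stdio.h>

int main(void)
{
	/* Indices of the dirty pages a tag lookup would return. */
	int pages[] = { 3, 4, 7, 9, 12 };
	int npages = sizeof(pages) / sizeof(pages[0]);
	int nr_to_write = 2;	/* plays the role of wbc->nr_to_write */
	int done_index = 0;
	int i;

	for (i = 0; i < npages; i++) {
		done_index = pages[i] + 1;	/* first index NOT handled */
		printf("writing page %d\n", pages[i]);
		if (--nr_to_write <= 0)
			break;			/* early termination */
	}

	/* What the kernel stores in mapping->writeback_index. */
	printf("resume next cycle at index %d\n", done_index);
	return 0;
}

Resuming at done_index (5 here) rediscovers pages 7, 9 and 12 on the next cycle; resuming at the lookup cursor (13 after a full batch) would silently skip them.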
+ +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -875,6 +875,7 @@ int write_cache_pages(struct address_spa + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ ++ pgoff_t done_index; + int cycled; + int range_whole = 0; + +@@ -900,6 +901,7 @@ int write_cache_pages(struct address_spa + cycled = 1; /* ignore range_cyclic tests */ + } + retry: ++ done_index = index; + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, +@@ -909,6 +911,8 @@ retry: + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + ++ done_index = page->index + 1; ++ + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or +@@ -970,7 +974,7 @@ retry: + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) +- mapping->writeback_index = index; ++ mapping->writeback_index = done_index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; diff --git a/queue-2.6.27/mm-write_cache_pages-integrity-fix.patch b/queue-2.6.27/mm-write_cache_pages-integrity-fix.patch new file mode 100644 index 00000000000..4099f01981a --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-integrity-fix.patch @@ -0,0 +1,85 @@ +From 05fe478dd04e02fa230c305ab9b5616669821dd3 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:08 -0800 +Subject: mm: write_cache_pages integrity fix + +From: Nick Piggin + +commit 05fe478dd04e02fa230c305ab9b5616669821dd3 upstream. + +In write_cache_pages, nr_to_write is heeded even for data-integrity syncs, +so the function will return success after writing out nr_to_write pages, +even if that was not sufficient to guarantee data integrity. + +The callers tend to set it to values that could break data integrity +semantics easily in practice. For example, nr_to_write can be set to +mapping->nrpages * 2; however, if a file has a single dirty page, then +fsync is called, subsequent pages might be concurrently added and dirtied, +then write_cache_pages might write out two of these newly dirty pages, +while not writing out the old page that should have been written out. + +Fix this by ignoring nr_to_write if it is a data integrity sync. + +This is a data integrity bug. + +The reason this has been done in the past is to avoid stalling sync +operations behind page dirtiers. + + "If a file has one dirty page at offset 1000000000000000 then someone + does an fsync() and someone else gets in first and starts madly writing + pages at offset 0, we want to write that page at 1000000000000000. + Somehow." + +What we do today is return success after an arbitrary number of pages are +written, whether or not we have provided the data-integrity semantics that +the caller has asked for. Even this doesn't actually fix all stall cases +completely: in the above situation, if the file has a huge number of pages +in pagecache (but not dirty), then mapping->nrpages is going to be huge, +even if pages are being dirtied. + +This change does indeed make the possibility of long stalls larger, and +that's not a good thing, but lying about data integrity is even worse.
We +have to either perform the sync, or return -ELINUXISLAME so at least the +caller knows what has happened. + +There are subsequent competing approaches in the works to solve the stall +problems properly, without compromising data integrity. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 2 +- + mm/page-writeback.c | 6 ++++-- + 2 files changed, 5 insertions(+), 3 deletions(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -209,7 +209,7 @@ int __filemap_fdatawrite_range(struct ad + int ret; + struct writeback_control wbc = { + .sync_mode = sync_mode, +- .nr_to_write = mapping->nrpages * 2, ++ .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -967,8 +967,10 @@ retry: + } + } + +- if (--(wbc->nr_to_write) <= 0) +- done = 1; ++ if (wbc->sync_mode == WB_SYNC_NONE) { ++ if (--wbc->nr_to_write <= 0) ++ done = 1; ++ } + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; diff --git a/queue-2.6.27/mm-write_cache_pages-more-terminate-quickly.patch b/queue-2.6.27/mm-write_cache_pages-more-terminate-quickly.patch new file mode 100644 index 00000000000..da422cf4282 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-more-terminate-quickly.patch @@ -0,0 +1,42 @@ +From 82fd1a9a8ced9607312b54859572bcc6211e8919 Mon Sep 17 00:00:00 2001 +From: Andrew Morton +Date: Tue, 6 Jan 2009 14:39:11 -0800 +Subject: mm: write_cache_pages more terminate quickly + +From: Andrew Morton + +commit 82fd1a9a8ced9607312b54859572bcc6211e8919 upstream. + +Now that we have the early-termination logic in place, it makes sense to +bail out early in all other cases where done is set to 1. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -987,12 +987,15 @@ continue_unlock: + + if (wbc->sync_mode == WB_SYNC_NONE) { + wbc->nr_to_write--; +- if (wbc->nr_to_write <= 0) ++ if (wbc->nr_to_write <= 0) { + done = 1; ++ break; ++ } + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; ++ break; + } + } + pagevec_release(&pvec); diff --git a/queue-2.6.27/mm-write_cache_pages-optimise-page-cleaning.patch b/queue-2.6.27/mm-write_cache_pages-optimise-page-cleaning.patch new file mode 100644 index 00000000000..0a33de0f317 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-optimise-page-cleaning.patch @@ -0,0 +1,57 @@ +From 515f4a037fb9ab736f8bad733fcd2ffd350cf265 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:10 -0800 +Subject: mm: write_cache_pages optimise page cleaning + +From: Nick Piggin + +commit 515f4a037fb9ab736f8bad733fcd2ffd350cf265 upstream. + +In write_cache_pages, if we get stuck behind another process that is +cleaning pages, we will be forced to wait for them to finish, then perform +our own writeout (if it was redirtied during the long wait), then wait for +that. + +If a page under writeout is still clean, we can skip waiting for it (if +we're part of a data integrity sync, we'll be waiting for all writeout +pages afterwards, so we'll still be waiting for the other guy's write +that's cleaned the page). 
+ +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -948,11 +948,20 @@ continue_unlock: + goto continue_unlock; + } + +- if (wbc->sync_mode != WB_SYNC_NONE) +- wait_on_page_writeback(page); ++ if (!PageDirty(page)) { ++ /* someone wrote it for us */ ++ goto continue_unlock; ++ } ++ ++ if (PageWriteback(page)) { ++ if (wbc->sync_mode != WB_SYNC_NONE) ++ wait_on_page_writeback(page); ++ else ++ goto continue_unlock; ++ } + +- if (PageWriteback(page) || +- !clear_page_dirty_for_io(page)) ++ BUG_ON(PageWriteback(page)); ++ if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); diff --git a/queue-2.6.27/mm-write_cache_pages-terminate-quickly.patch b/queue-2.6.27/mm-write_cache_pages-terminate-quickly.patch new file mode 100644 index 00000000000..de9da6d14c4 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-terminate-quickly.patch @@ -0,0 +1,75 @@ +From d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:11 -0800 +Subject: mm: write_cache_pages terminate quickly + +From: Nick Piggin + +commit d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 upstream. + +Terminate the write_cache_pages loop upon encountering the first page past +end, without locking the page. Pages cannot have their index change when +we have a reference on them (truncate, eg truncate_inode_pages_range +performs the same check without the page lock). + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 32 ++++++++++++++++---------------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -914,15 +914,24 @@ retry: + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + +- done_index = page->index + 1; +- + /* +- * At this point we hold neither mapping->tree_lock nor +- * lock on the page itself: the page may be truncated or +- * invalidated (changing page->mapping to NULL), or even +- * swizzled back from swapper_space to tmpfs file +- * mapping ++ * At this point, the page may be truncated or ++ * invalidated (changing page->mapping to NULL), or ++ * even swizzled back from swapper_space to tmpfs file ++ * mapping. However, page->index will not change ++ * because we have a reference on the page. + */ ++ if (page->index > end) { ++ /* ++ * can't be range_cyclic (1st pass) because ++ * end == -1 in that case. ++ */ ++ done = 1; ++ break; ++ } ++ ++ done_index = page->index + 1; ++ + lock_page(page); + + /* +@@ -939,15 +948,6 @@ continue_unlock: + continue; + } + +- if (page->index > end) { +- /* +- * can't be range_cyclic (1st pass) because +- * end == -1 in that case. 
+- */ +- done = 1; +- goto continue_unlock; +- } +- + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; diff --git a/queue-2.6.27/mm-write_cache_pages-writepage-error-fix.patch b/queue-2.6.27/mm-write_cache_pages-writepage-error-fix.patch new file mode 100644 index 00000000000..c75a00584e8 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-writepage-error-fix.patch @@ -0,0 +1,67 @@ +From 00266770b8b3a6a77f896ca501a0613739086832 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:06 -0800 +Subject: mm: write_cache_pages writepage error fix + +From: Nick Piggin + +commit 00266770b8b3a6a77f896ca501a0613739086832 upstream. + +In write_cache_pages, if ret signals a real error, but we still have some +pages left in the pagevec, done would be set to 1, but the remaining pages +would continue to be processed and ret would be overwritten in the process. + +It could easily be overwritten with success, and thus success would be +returned even if there is an error. Thus the caller is told all writes +succeeded, whereas in reality some did not. + +Fix this by bailing immediately if there is an error, and retaining the +first error code. + +This is a data integrity bug. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 25 ++++++++++++++++++++----- + 1 file changed, 20 insertions(+), 5 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -948,11 +948,26 @@ retry: + + ret = (*writepage)(page, wbc, data); + +- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { +- unlock_page(page); +- ret = 0; +- } +- if (ret || (--(wbc->nr_to_write) <= 0)) ++ if (unlikely(ret)) { ++ if (ret == AOP_WRITEPAGE_ACTIVATE) { ++ unlock_page(page); ++ ret = 0; ++ } else { ++ /* ++ * done_index is set past this page, ++ * so media errors will not choke ++ * background writeout for the entire ++ * file. This has consequences for ++ * range_cyclic semantics (ie. it may ++ * not be suitable for data integrity ++ * writeout). ++ */ ++ done = 1; ++ break; ++ } ++ } ++ ++ if (--(wbc->nr_to_write) <= 0) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; diff --git a/queue-2.6.27/series index 2a12e53ad52..d4523051de0 100644 --- a/queue-2.6.27/series +++ b/queue-2.6.27/series @@ -26,3 +26,15 @@ dell_rbu-use-scnprintf-instead-of-less-secure-sprintf.patch hwmon-fix-config_dmi-n-fallback-to-probe.patch powerpc-is_hugepage_only_range-must-account-for-both-4kb-and-64kb-slices.patch mm-write_cache_pages-cyclic-fix.patch +mm-write_cache_pages-early-loop-termination.patch +mm-write_cache_pages-writepage-error-fix.patch +mm-write_cache_pages-integrity-fix.patch +mm-write_cache_pages-cleanups.patch +mm-write_cache_pages-optimise-page-cleaning.patch +mm-write_cache_pages-terminate-quickly.patch +mm-write_cache_pages-more-terminate-quickly.patch +mm-do_sync_mapping_range-integrity-fix.patch +mm-direct-io-starvation-improvement.patch +fs-remove-wb_sync_hold.patch +fs-sync_sb_inodes-fix.patch +fs-sys_sync-fix.patch
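To close the series, here is a minimal userspace C model of the error-retention logic from mm-write_cache_pages-writepage-error-fix.patch above; writepage() is a stand-in that fails on one page, not a real address_space operation. It demonstrates the invariant the fix establishes: once a hard error is seen, the loop bails immediately and the first error code is what the caller receives, rather than being overwritten by later successful writes in the same batch.

#include <stdio.h>

/* Stand-in for a ->writepage call: page 2 hits a media error. */
static int writepage(int index)
{
	return index == 2 ? -5 /* -EIO */ : 0;
}

int main(void)
{
	int ret = 0, done = 0, index;

	for (index = 0; index < 6 && !done; index++) {
		int err = writepage(index);

		if (err) {
			ret = err;	/* retain the FIRST error... */
			done = 1;	/* ...and bail, so later successes
					 * cannot overwrite it */
		}
	}
	printf("sync result: %d\n", ret);	/* -5, not 0 */
	return 0;
}

Building and running this (for example, cc -o errdemo errdemo.c && ./errdemo, with errdemo a hypothetical file name) prints "sync result: -5", confirming the first error survives to the caller.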