--- /dev/null
+From 4f5a99d64c17470a784a6c68064207d82e3e74a5 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:25 -0800
+Subject: fs: remove WB_SYNC_HOLD
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 4f5a99d64c17470a784a6c68064207d82e3e74a5 upstream.
+
+Remove WB_SYNC_HOLD. The primary motivation is the design of my
+anti-starvation code for fsync. It requires taking an inode lock over the
+sync operation, so we could run into lock ordering problems with multiple
+inodes. It is possible to take a single global lock to solve the ordering
+problem, but then that would prevent a future nice implementation of "sync
+multiple inodes" based on lock order via inode address.
+
+Seems like a backward step to remove this, but actually it is busted
+anyway: we can't use the inode lists for data integrity wait: an inode can
+be taken off the dirty lists but still be under writeback. In order to
+satisfy data integrity semantics, we should wait for it to finish
+writeback, but if we only search the dirty lists, we'll miss it.
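+
+Schematically, the window that a dirty-list search misses (a simplified
+interleaving, not the exact call chain):
+
+	writeback thread                  sys_sync
+	----------------                  --------
+	__writeback_single_inode(inode)
+	  inode removed from s_dirty
+	  writeback submitted, pages
+	  still under PageWriteback
+	                                  searches the dirty lists
+	                                  inode not found -> not waited on
+	                                  returns before the I/O completes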
+
+It would be possible to have a "writeback" list, for sys_sync, I suppose.
+But why complicate things by optimising prematurely? For unmounting, we
+could avoid the "livelock avoidance" code, which would be easier, but
+again premature IMO.
+
+Fixing the existing data integrity problem will come next.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 12 ++----------
+ include/linux/writeback.h | 1 -
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *i
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
+- * that it can be located for waiting on in __writeback_single_inode().
+- *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+@@ -499,10 +496,6 @@ void generic_sync_sb_inodes(struct super
+ __iget(inode);
+ pages_skipped = wbc->pages_skipped;
+ __writeback_single_inode(inode, wbc);
+- if (wbc->sync_mode == WB_SYNC_HOLD) {
+- inode->dirtied_when = jiffies;
+- list_move(&inode->i_list, &sb->s_dirty);
+- }
+ if (current_is_pdflush())
+ writeback_release(bdi);
+ if (wbc->pages_skipped != pages_skipped) {
+@@ -588,8 +581,7 @@ restart:
+
+ /*
+ * writeback and wait upon the filesystem's dirty inodes. The caller will
+- * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
+- * used to park the written inodes on sb->s_dirty for the wait pass.
++ * do this in two passes - one to write, and one to wait.
+ *
+ * A finite limit is set on the number of pages which will be written.
+ * To prevent infinite livelock of sys_sync().
+@@ -600,7 +592,7 @@ restart:
+ void sync_inodes_sb(struct super_block *sb, int wait)
+ {
+ struct writeback_control wbc = {
+- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
++ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -30,7 +30,6 @@ static inline int task_is_pdflush(struct
+ enum writeback_sync_modes {
+ WB_SYNC_NONE, /* Don't wait on anything */
+ WB_SYNC_ALL, /* Wait on every mapping */
+- WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */
+ };
+
+ /*
--- /dev/null
+From 38f21977663126fef53f5585e7f1653d8ebe55c4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:25 -0800
+Subject: fs: sync_sb_inodes fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 38f21977663126fef53f5585e7f1653d8ebe55c4 upstream.
+
+Fix data integrity semantics required by sys_sync, by iterating over all
+inodes and waiting for any writeback pages after the initial writeout.
+Comments explain the exact problem.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 53 insertions(+), 7 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -440,6 +440,7 @@ void generic_sync_sb_inodes(struct super
+ struct writeback_control *wbc)
+ {
+ const unsigned long start = jiffies; /* livelock avoidance */
++ int sync = wbc->sync_mode == WB_SYNC_ALL;
+
+ spin_lock(&inode_lock);
+ if (!wbc->for_kupdate || list_empty(&sb->s_io))
+@@ -516,7 +517,49 @@ void generic_sync_sb_inodes(struct super
+ if (!list_empty(&sb->s_more_io))
+ wbc->more_io = 1;
+ }
+- spin_unlock(&inode_lock);
++
++ if (sync) {
++ struct inode *inode, *old_inode = NULL;
++
++ /*
++ * Data integrity sync. Must wait for all pages under writeback,
++ * because there may have been pages dirtied before our sync
++ * call, but which had writeout started before we write it out.
++ * In which case, the inode may not be on the dirty list, but
++ * we still have to wait for that writeout.
++ */
++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
++ struct address_space *mapping;
++
++ if (inode->i_state & (I_FREEING|I_WILL_FREE))
++ continue;
++ mapping = inode->i_mapping;
++ if (mapping->nrpages == 0)
++ continue;
++ __iget(inode);
++ spin_unlock(&inode_lock);
++ /*
++ * We hold a reference to 'inode' so it couldn't have
++ * been removed from s_inodes list while we dropped the
++ * inode_lock. We cannot iput the inode now as we can
++ * be holding the last reference and we cannot iput it
++ * under inode_lock. So we keep the reference and iput
++ * it later.
++ */
++ iput(old_inode);
++ old_inode = inode;
++
++ filemap_fdatawait(mapping);
++
++ cond_resched();
++
++ spin_lock(&inode_lock);
++ }
++ spin_unlock(&inode_lock);
++ iput(old_inode);
++ } else
++ spin_unlock(&inode_lock);
++
+ return; /* Leave any unwritten inodes on s_io */
+ }
+ EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
+@@ -596,13 +639,16 @@ void sync_inodes_sb(struct super_block *
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+
+- wbc.nr_to_write = nr_dirty + nr_unstable +
+- (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
+- nr_dirty + nr_unstable;
+- wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
++ if (!wait) {
++ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
++ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
++
++ wbc.nr_to_write = nr_dirty + nr_unstable +
++ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
++ } else
++ wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
++
+ sync_sb_inodes(sb, &wbc);
+ }
+
--- /dev/null
+From 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:26 -0800
+Subject: fs: sys_sync fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b upstream.
+
+s_syncing livelock avoidance was breaking the data integrity guarantee of
+sys_sync, by allowing sys_sync to skip writing or waiting for superblocks
+if there is a concurrent sys_sync happening.
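+
+Schematically (a simplified interleaving, both calls racing over the
+same superblock):
+
+	sys_sync (A)                      sys_sync (B)
+	------------                      ------------
+	sb->s_syncing = 1
+	writes sb's dirty inodes...
+	                                  sees sb->s_syncing == 1
+	                                  skips sb entirely
+	                                  returns without writing or
+	                                  waiting on sb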
+
+This livelock avoidance is much less important now that we don't have the
+get_super_to_sync() call after every sb that we sync. This was replaced
+by __put_super_and_need_restart.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 20 +-------------------
+ include/linux/fs.h | 1 -
+ 2 files changed, 1 insertion(+), 20 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -652,18 +652,6 @@ void sync_inodes_sb(struct super_block *
+ sync_sb_inodes(sb, &wbc);
+ }
+
+-/*
+- * Rather lame livelock avoidance.
+- */
+-static void set_sb_syncing(int val)
+-{
+- struct super_block *sb;
+- spin_lock(&sb_lock);
+- list_for_each_entry_reverse(sb, &super_blocks, s_list)
+- sb->s_syncing = val;
+- spin_unlock(&sb_lock);
+-}
+-
+ /**
+ * sync_inodes - writes all inodes to disk
+ * @wait: wait for completion
+@@ -690,9 +678,6 @@ static void __sync_inodes(int wait)
+ spin_lock(&sb_lock);
+ restart:
+ list_for_each_entry(sb, &super_blocks, s_list) {
+- if (sb->s_syncing)
+- continue;
+- sb->s_syncing = 1;
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_read(&sb->s_umount);
+@@ -710,13 +695,10 @@ restart:
+
+ void sync_inodes(int wait)
+ {
+- set_sb_syncing(0);
+ __sync_inodes(0);
+
+- if (wait) {
+- set_sb_syncing(0);
++ if (wait)
+ __sync_inodes(1);
+- }
+ }
+
+ /**
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -1124,7 +1124,6 @@ struct super_block {
+ struct rw_semaphore s_umount;
+ struct mutex s_lock;
+ int s_count;
+- int s_syncing;
+ int s_need_sync_fs;
+ atomic_t s_active;
+ #ifdef CONFIG_SECURITY
--- /dev/null
+From 48b47c561e41525061b5bc0cfd67d6367fd11dc4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:22 -0800
+Subject: mm: direct IO starvation improvement
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 48b47c561e41525061b5bc0cfd67d6367fd11dc4 upstream.
+
+Direct IO can invalidate and sync a lot of pagecache pages in the mapping.
+A 4K direct IO will actually try to sync and/or invalidate the pagecache
+of the entire file, for example (which might be many GB or TB large).
+
+Improve this by doing range syncs. Also, memory no longer has to be
+unmapped to catch the dirty bits for syncing, as dirty bits would remain
+coherent due to dirty mmap accounting.
+
+This fixes the immediate DM deadlocks when doing direct IO reads to a
+block device with a mounted filesystem, if only by papering over the
+problem somewhat rather than addressing the fsync starvation cases.
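+
+The shape of the read-side change, schematically (condensed from the
+generic_file_aio_read() hunk below; error handling omitted):
+
+	/* pos, iov, nr_segs: as passed to generic_file_aio_read() */
+	size_t len = iov_length(iov, nr_segs);
+
+	/* before: writes back and waits on the entire mapping */
+	retval = filemap_write_and_wait(mapping);
+
+	/* after: only the byte range this direct IO touches */
+	retval = filemap_write_and_wait_range(mapping, pos, pos + len - 1);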
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/filemap.c | 16 +++++-----------
+ 1 file changed, 5 insertions(+), 11 deletions(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -1317,7 +1317,8 @@ generic_file_aio_read(struct kiocb *iocb
+ goto out; /* skip atime */
+ size = i_size_read(inode);
+ if (pos < size) {
+- retval = filemap_write_and_wait(mapping);
++ retval = filemap_write_and_wait_range(mapping, pos,
++ pos + iov_length(iov, nr_segs) - 1);
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+@@ -2067,18 +2068,10 @@ generic_file_direct_write(struct kiocb *
+ if (count != ocount)
+ *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+- /*
+- * Unmap all mmappings of the file up-front.
+- *
+- * This will cause any pte dirty bits to be propagated into the
+- * pageframes for the subsequent filemap_write_and_wait().
+- */
+ write_len = iov_length(iov, *nr_segs);
+ end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+- if (mapping_mapped(mapping))
+- unmap_mapping_range(mapping, pos, write_len, 0);
+
+- written = filemap_write_and_wait(mapping);
++ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
+ if (written)
+ goto out;
+
+@@ -2298,7 +2291,8 @@ generic_file_buffered_write(struct kiocb
+ * the file data here, to try to honour O_DIRECT expectations.
+ */
+ if (unlikely(file->f_flags & O_DIRECT) && written)
+- status = filemap_write_and_wait(mapping);
++ status = filemap_write_and_wait_range(mapping,
++ pos, pos + written - 1);
+
+ return written ? written : status;
+ }
--- /dev/null
+From ee53a891f47444c53318b98dac947ede963db400 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:12 -0800
+Subject: mm: do_sync_mapping_range integrity fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit ee53a891f47444c53318b98dac947ede963db400 upstream.
+
+Chris Mason noticed do_sync_mapping_range didn't actually ask for data
+integrity writeout. Unfortunately, it is advertised as being usable for
+data integrity operations.
+
+This is a data integrity bug.
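+
+For context, a hypothetical caller relying on the advertised semantics
+(sync_file_range(2) is implemented on top of do_sync_mapping_range):
+
+	/* write out a 1MiB range of fd and wait for it to hit disk */
+	ret = sync_file_range(fd, offset, 1 << 20,
+			      SYNC_FILE_RANGE_WAIT_BEFORE |
+			      SYNC_FILE_RANGE_WRITE |
+			      SYNC_FILE_RANGE_WAIT_AFTER);
+
+With WB_SYNC_NONE, the WRITE step could skip pages (pages already under
+writeback, for instance), so the subsequent wait was no data integrity
+guarantee at all.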
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/sync.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/sync.c
++++ b/fs/sync.c
+@@ -287,7 +287,7 @@ int do_sync_mapping_range(struct address
+
+ if (flags & SYNC_FILE_RANGE_WRITE) {
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+- WB_SYNC_NONE);
++ WB_SYNC_ALL);
+ if (ret < 0)
+ goto out;
+ }
--- /dev/null
+From 5a3d5c9813db56a75934eb1015367fda23a8b0b4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:09 -0800
+Subject: mm: write_cache_pages cleanups
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 5a3d5c9813db56a75934eb1015367fda23a8b0b4 upstream.
+
+Get rid of some complex expressions from flow control statements, add a
+comment, remove some duplicate code.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 34 ++++++++++++++++++++++------------
+ 1 file changed, 22 insertions(+), 12 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -899,11 +899,14 @@ int write_cache_pages(struct address_spa
+ }
+ retry:
+ done_index = index;
+- while (!done && (index <= end) &&
+- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+- PAGECACHE_TAG_DIRTY,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+- unsigned i;
++ while (!done && (index <= end)) {
++ int i;
++
++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
++ PAGECACHE_TAG_DIRTY,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
++ if (nr_pages == 0)
++ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+@@ -919,7 +922,16 @@ retry:
+ */
+ lock_page(page);
+
++ /*
++ * Page truncated or invalidated. We can freely skip it
++ * then, even for data integrity operations: the page
++ * has disappeared concurrently, so there could be no
++		 * real expectation of this data integrity operation
++ * even if there is now a new, dirty page at the same
++ * pagecache address.
++ */
+ if (unlikely(page->mapping != mapping)) {
++continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+@@ -930,18 +942,15 @@ retry:
+ * end == -1 in that case.
+ */
+ done = 1;
+- unlock_page(page);
+- continue;
++ goto continue_unlock;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+- !clear_page_dirty_for_io(page)) {
+- unlock_page(page);
+- continue;
+- }
++ !clear_page_dirty_for_io(page))
++ goto continue_unlock;
+
+ ret = (*writepage)(page, wbc, data);
+ if (unlikely(ret)) {
+@@ -964,7 +973,8 @@ retry:
+ }
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+- if (--wbc->nr_to_write <= 0)
++ wbc->nr_to_write--;
++ if (wbc->nr_to_write <= 0)
+ done = 1;
+ }
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
--- /dev/null
+From bd19e012f6fd3b7309689165ea865cbb7bb88c1e Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:06 -0800
+Subject: mm: write_cache_pages early loop termination
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit bd19e012f6fd3b7309689165ea865cbb7bb88c1e upstream.
+
+We'd like to break out of the loop early in many situations; however, the
+existing code has been setting mapping->writeback_index past the final
+page in the pagevec lookup for cyclic writeback. This is a problem if we
+don't process all pages up to the final page.
+
+Currently the code mostly keeps writeback_index reasonable and hacks
+around this by not breaking out of the loop or writing pages outside the
+range in these cases. Keep track of a real "done index" that enables us
+to terminate the loop in a much more flexible manner.
+
+Needed by the subsequent patch to preserve writepage errors, and then
+further patches to break out of the loop early for other reasons. However
+there are no functional changes with this patch alone.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -871,6 +871,7 @@ int write_cache_pages(struct address_spa
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
++ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ long nr_to_write = wbc->nr_to_write;
+@@ -897,6 +898,7 @@ int write_cache_pages(struct address_spa
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ retry:
++ done_index = index;
+ while (!done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+@@ -906,6 +908,8 @@ retry:
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
++ done_index = page->index + 1;
++
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+@@ -968,7 +972,7 @@ retry:
+ }
+ if (!wbc->no_nrwrite_index_update) {
+ if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+- mapping->writeback_index = index;
++ mapping->writeback_index = done_index;
+ wbc->nr_to_write = nr_to_write;
+ }
+
--- /dev/null
+From 05fe478dd04e02fa230c305ab9b5616669821dd3 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:08 -0800
+Subject: mm: write_cache_pages integrity fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 05fe478dd04e02fa230c305ab9b5616669821dd3 upstream.
+
+In write_cache_pages, nr_to_write is heeded even for data-integrity syncs,
+so the function will return success after writing out nr_to_write pages,
+even if that was not sufficient to guarantee data integrity.
+
+The callers tend to set it to values that could break data integrity
+semantics easily in practice.  For example, nr_to_write can be set to
+mapping->nrpages * 2; if a file has a single dirty page and fsync is
+then called, subsequent pages might be concurrently added and dirtied,
+and write_cache_pages might write out two of these newly dirtied pages
+while never writing out the old page that should have been written out.
+
+Fix this by ignoring nr_to_write if it is a data integrity sync.
+
+This is a data integrity bug.
+
+The reason this has been done in the past is to avoid stalling sync
+operations behind page dirtiers.
+
+ "If a file has one dirty page at offset 1000000000000000 then someone
+ does an fsync() and someone else gets in first and starts madly writing
+ pages at offset 0, we want to write that page at 1000000000000000.
+ Somehow."
+
+What we do today is return success after an arbitrary amount of pages are
+written, whether or not we have provided the data-integrity semantics that
+the caller has asked for. Even this doesn't actually fix all stall cases
+completely: in the above situation, if the file has a huge number of pages
+in pagecache (but not dirty), then mapping->nrpages is going to be huge,
+even if pages are being dirtied.
+
+This change does indeed make the possibility of long stalls larger, and
+that's not a good thing, but lying about data integrity is even worse. We
+have to either perform the sync, or return -ELINUXISLAME so at least the
+caller knows what has happened.
+
+There are subsequent competing approaches in the works to solve the stall
+problems properly, without compromising data integrity.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/filemap.c | 2 +-
+ mm/page-writeback.c | 6 ++++--
+ 2 files changed, 5 insertions(+), 3 deletions(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct ad
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+- .nr_to_write = mapping->nrpages * 2,
++ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -963,8 +963,10 @@ retry:
+ }
+ }
+
+- if (--nr_to_write <= 0)
+- done = 1;
++ if (wbc->sync_mode == WB_SYNC_NONE) {
++ if (--wbc->nr_to_write <= 0)
++ done = 1;
++ }
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
--- /dev/null
+From 82fd1a9a8ced9607312b54859572bcc6211e8919 Mon Sep 17 00:00:00 2001
+From: Andrew Morton <akpm@linux-foundation.org>
+Date: Tue, 6 Jan 2009 14:39:11 -0800
+Subject: mm: write_cache_pages more terminate quickly
+
+From: Andrew Morton <akpm@linux-foundation.org>
+
+commit 82fd1a9a8ced9607312b54859572bcc6211e8919 upstream.
+
+Now that we have the early-termination logic in place, it makes sense to
+bail out early in all other cases where done is set to 1.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -983,12 +983,15 @@ continue_unlock:
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ wbc->nr_to_write--;
+- if (wbc->nr_to_write <= 0)
++ if (wbc->nr_to_write <= 0) {
+ done = 1;
++ break;
++ }
+ }
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
++ break;
+ }
+ }
+ pagevec_release(&pvec);
--- /dev/null
+From 515f4a037fb9ab736f8bad733fcd2ffd350cf265 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:10 -0800
+Subject: mm: write_cache_pages optimise page cleaning
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 515f4a037fb9ab736f8bad733fcd2ffd350cf265 upstream.
+
+In write_cache_pages, if we get stuck behind another process that is
+cleaning pages, we will be forced to wait for them to finish, then perform
+our own writeout (if it was redirtied during the long wait), then wait for
+that.
+
+If a page under writeout is still clean, we can skip waiting for it (if
+we're part of a data integrity sync, we'll be waiting for all writeout
+pages afterwards, so we'll still be waiting for the other guy's write
+that's cleaned the page).
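+
+The resulting per-page decision, schematically (a sketch; the code below
+is authoritative):
+
+	page state                  WB_SYNC_ALL          WB_SYNC_NONE
+	----------                  -----------          ------------
+	not dirty                   skip (someone        skip
+	                            wrote it for us)
+	dirty, under writeback      wait, then write     skip (a write is
+	                                                 already in flight)
+	dirty, not under writeback  write                write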
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 17 +++++++++++++----
+ 1 file changed, 13 insertions(+), 4 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -945,11 +945,20 @@ continue_unlock:
+ goto continue_unlock;
+ }
+
+- if (wbc->sync_mode != WB_SYNC_NONE)
+- wait_on_page_writeback(page);
++ if (!PageDirty(page)) {
++ /* someone wrote it for us */
++ goto continue_unlock;
++ }
++
++ if (PageWriteback(page)) {
++ if (wbc->sync_mode != WB_SYNC_NONE)
++ wait_on_page_writeback(page);
++ else
++ goto continue_unlock;
++ }
+
+- if (PageWriteback(page) ||
+- !clear_page_dirty_for_io(page))
++ BUG_ON(PageWriteback(page));
++ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = (*writepage)(page, wbc, data);
--- /dev/null
+From d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:11 -0800
+Subject: mm: write_cache_pages terminate quickly
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 upstream.
+
+Terminate the write_cache_pages loop upon encountering the first page past
+end, without locking the page. Pages cannot have their index change when
+we have a reference on them (truncate, eg truncate_inode_pages_range
+performs the same check without the page lock).
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 32 ++++++++++++++++----------------
+ 1 file changed, 16 insertions(+), 16 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -911,15 +911,24 @@ retry:
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+- done_index = page->index + 1;
+-
+ /*
+- * At this point we hold neither mapping->tree_lock nor
+- * lock on the page itself: the page may be truncated or
+- * invalidated (changing page->mapping to NULL), or even
+- * swizzled back from swapper_space to tmpfs file
+- * mapping
++ * At this point, the page may be truncated or
++ * invalidated (changing page->mapping to NULL), or
++ * even swizzled back from swapper_space to tmpfs file
++ * mapping. However, page->index will not change
++ * because we have a reference on the page.
+ */
++ if (page->index > end) {
++ /*
++ * can't be range_cyclic (1st pass) because
++ * end == -1 in that case.
++ */
++ done = 1;
++ break;
++ }
++
++ done_index = page->index + 1;
++
+ lock_page(page);
+
+ /*
+@@ -936,15 +945,6 @@ continue_unlock:
+ continue;
+ }
+
+- if (page->index > end) {
+- /*
+- * can't be range_cyclic (1st pass) because
+- * end == -1 in that case.
+- */
+- done = 1;
+- goto continue_unlock;
+- }
+-
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
--- /dev/null
+From 00266770b8b3a6a77f896ca501a0613739086832 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:06 -0800
+Subject: mm: write_cache_pages writepage error fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 00266770b8b3a6a77f896ca501a0613739086832 upstream.
+
+In write_cache_pages, if ret signals a real error but we still have some
+pages left in the pagevec, done would be set to 1, the remaining pages
+would continue to be processed, and ret would be overwritten in the
+process.
+
+It could easily be overwritten with success, and thus success would be
+returned even if there is an error.  Thus the caller would be told all
+writes succeeded, whereas in reality some did not.
+
+Fix this by bailing immediately if there is an error, and retaining the
+first error code.
+
+This is a data integrity bug.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 24 +++++++++++++++++++-----
+ 1 file changed, 19 insertions(+), 5 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -944,12 +944,26 @@ retry:
+ }
+
+ ret = (*writepage)(page, wbc, data);
++ if (unlikely(ret)) {
++ if (ret == AOP_WRITEPAGE_ACTIVATE) {
++ unlock_page(page);
++ ret = 0;
++ } else {
++ /*
++ * done_index is set past this page,
++ * so media errors will not choke
++ * background writeout for the entire
++ * file. This has consequences for
++ * range_cyclic semantics (ie. it may
++ * not be suitable for data integrity
++ * writeout).
++ */
++ done = 1;
++ break;
++ }
++ }
+
+- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+- unlock_page(page);
+- ret = 0;
+- }
+- if (ret || (--nr_to_write <= 0))
++ if (--nr_to_write <= 0)
+ done = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
powerpc-is_hugepage_only_range-must-account-for-both-4kb-and-64kb-slices.patch
hwmon-fix-config_dmi-n-fallback-to-probe.patch
mm-write_cache_pages-cyclic-fix.patch
+mm-write_cache_pages-early-loop-termination.patch
+mm-write_cache_pages-writepage-error-fix.patch
+mm-write_cache_pages-integrity-fix.patch
+mm-write_cache_pages-cleanups.patch
+mm-write_cache_pages-optimise-page-cleaning.patch
+mm-write_cache_pages-terminate-quickly.patch
+mm-write_cache_pages-more-terminate-quickly.patch
+mm-do_sync_mapping_range-integrity-fix.patch
+mm-direct-io-starvation-improvement.patch
+fs-remove-wb_sync_hold.patch
+fs-sync_sb_inodes-fix.patch
+fs-sys_sync-fix.patch