--- /dev/null
+From 4f5a99d64c17470a784a6c68064207d82e3e74a5 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:25 -0800
+Subject: fs: remove WB_SYNC_HOLD
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 4f5a99d64c17470a784a6c68064207d82e3e74a5 upstream.
+
+Remove WB_SYNC_HOLD. The primary motivation is the design of my
+anti-starvation code for fsync. It requires taking an inode lock over the
+sync operation, so we could run into lock ordering problems with multiple
+inodes. It is possible to take a single global lock to solve the ordering
+problem, but then that would prevent a future nice implementation of "sync
+multiple inodes" based on lock order via inode address.
+
+Seems like a backward step to remove this, but actually it is busted
+anyway: we can't use the inode lists for data integrity wait: an inode can
+be taken off the dirty lists but still be under writeback. In order to
+satisfy data integrity semantics, we should wait for it to finish
+writeback, but if we only search the dirty lists, we'll miss it.
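+
+Schematically, the window that a dirty-list search misses (a simplified
+interleaving, not the exact call chain):
+
+	writeback thread                  sys_sync
+	----------------                  --------
+	__writeback_single_inode(inode)
+	  inode removed from s_dirty
+	  writeback submitted, pages
+	  still under PageWriteback
+	                                  searches the dirty lists
+	                                  inode not found -> not waited on
+	                                  returns before the I/O completes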
+
+It would be possible to have a "writeback" list, for sys_sync, I suppose.
+But why complicate things by optimising prematurely? For unmounting, we
+could avoid the "livelock avoidance" code, which would be easier, but
+again premature IMO.
+
+Fixing the existing data integrity problem will come next.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 12 ++----------
+ include/linux/writeback.h | 1 -
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *i
+ * If we're a pdlfush thread, then implement pdflush collision avoidance
+ * against the entire list.
+ *
+- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
+- * that it can be located for waiting on in __writeback_single_inode().
+- *
+ * If `bdi' is non-zero then we're being asked to writeback a specific queue.
+ * This function assumes that the blockdev superblock's inodes are backed by
+ * a variety of queues, so all inodes are searched. For other superblocks,
+@@ -499,10 +496,6 @@ void generic_sync_sb_inodes(struct super
+ __iget(inode);
+ pages_skipped = wbc->pages_skipped;
+ __writeback_single_inode(inode, wbc);
+- if (wbc->sync_mode == WB_SYNC_HOLD) {
+- inode->dirtied_when = jiffies;
+- list_move(&inode->i_list, &sb->s_dirty);
+- }
+ if (current_is_pdflush())
+ writeback_release(bdi);
+ if (wbc->pages_skipped != pages_skipped) {
+@@ -588,8 +581,7 @@ restart:
+
+ /*
+ * writeback and wait upon the filesystem's dirty inodes. The caller will
+- * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is
+- * used to park the written inodes on sb->s_dirty for the wait pass.
++ * do this in two passes - one to write, and one to wait.
+ *
+ * A finite limit is set on the number of pages which will be written.
+ * To prevent infinite livelock of sys_sync().
+@@ -600,7 +592,7 @@ restart:
+ void sync_inodes_sb(struct super_block *sb, int wait)
+ {
+ struct writeback_control wbc = {
+- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
++ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+--- a/include/linux/writeback.h
++++ b/include/linux/writeback.h
+@@ -30,7 +30,6 @@ static inline int task_is_pdflush(struct
+ enum writeback_sync_modes {
+ WB_SYNC_NONE, /* Don't wait on anything */
+ WB_SYNC_ALL, /* Wait on every mapping */
+- WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */
+ };
+
+ /*
--- /dev/null
+From 38f21977663126fef53f5585e7f1653d8ebe55c4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:25 -0800
+Subject: fs: sync_sb_inodes fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 38f21977663126fef53f5585e7f1653d8ebe55c4 upstream.
+
+Fix data integrity semantics required by sys_sync, by iterating over all
+inodes and waiting for any writeback pages after the initial writeout.
+Comments explain the exact problem.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 53 insertions(+), 7 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -440,6 +440,7 @@ void generic_sync_sb_inodes(struct super
+ struct writeback_control *wbc)
+ {
+ const unsigned long start = jiffies; /* livelock avoidance */
++ int sync = wbc->sync_mode == WB_SYNC_ALL;
+
+ spin_lock(&inode_lock);
+ if (!wbc->for_kupdate || list_empty(&sb->s_io))
+@@ -516,7 +517,49 @@ void generic_sync_sb_inodes(struct super
+ if (!list_empty(&sb->s_more_io))
+ wbc->more_io = 1;
+ }
+- spin_unlock(&inode_lock);
++
++ if (sync) {
++ struct inode *inode, *old_inode = NULL;
++
++ /*
++ * Data integrity sync. Must wait for all pages under writeback,
++ * because there may have been pages dirtied before our sync
++ * call, but which had writeout started before we write it out.
++ * In which case, the inode may not be on the dirty list, but
++ * we still have to wait for that writeout.
++ */
++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
++ struct address_space *mapping;
++
++ if (inode->i_state & (I_FREEING|I_WILL_FREE))
++ continue;
++ mapping = inode->i_mapping;
++ if (mapping->nrpages == 0)
++ continue;
++ __iget(inode);
++ spin_unlock(&inode_lock);
++ /*
++ * We hold a reference to 'inode' so it couldn't have
++ * been removed from s_inodes list while we dropped the
++ * inode_lock. We cannot iput the inode now as we can
++ * be holding the last reference and we cannot iput it
++ * under inode_lock. So we keep the reference and iput
++ * it later.
++ */
++ iput(old_inode);
++ old_inode = inode;
++
++ filemap_fdatawait(mapping);
++
++ cond_resched();
++
++ spin_lock(&inode_lock);
++ }
++ spin_unlock(&inode_lock);
++ iput(old_inode);
++ } else
++ spin_unlock(&inode_lock);
++
+ return; /* Leave any unwritten inodes on s_io */
+ }
+ EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
+@@ -596,13 +639,16 @@ void sync_inodes_sb(struct super_block *
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ };
+- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+
+- wbc.nr_to_write = nr_dirty + nr_unstable +
+- (inodes_stat.nr_inodes - inodes_stat.nr_unused) +
+- nr_dirty + nr_unstable;
+- wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
++ if (!wait) {
++ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
++ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
++
++ wbc.nr_to_write = nr_dirty + nr_unstable +
++ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
++ } else
++ wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
++
+ sync_sb_inodes(sb, &wbc);
+ }
+
--- /dev/null
+From 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:26 -0800
+Subject: fs: sys_sync fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b upstream.
+
+s_syncing livelock avoidance was breaking the data integrity guarantee of
+sys_sync, by allowing sys_sync to skip writing or waiting for superblocks
+if there is a concurrent sys_sync happening.
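+
+Schematically (a simplified interleaving, both calls racing over the
+same superblock):
+
+	sys_sync (A)                      sys_sync (B)
+	------------                      ------------
+	sb->s_syncing = 1
+	writes sb's dirty inodes...
+	                                  sees sb->s_syncing == 1
+	                                  skips sb entirely
+	                                  returns without writing or
+	                                  waiting on sb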
+
+This livelock avoidance is much less important now that we don't have the
+get_super_to_sync() call after every sb that we sync. This was replaced
+by __put_super_and_need_restart.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/fs-writeback.c | 20 +-------------------
+ include/linux/fs.h | 1 -
+ 2 files changed, 1 insertion(+), 20 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -652,18 +652,6 @@ void sync_inodes_sb(struct super_block *
+ sync_sb_inodes(sb, &wbc);
+ }
+
+-/*
+- * Rather lame livelock avoidance.
+- */
+-static void set_sb_syncing(int val)
+-{
+- struct super_block *sb;
+- spin_lock(&sb_lock);
+- list_for_each_entry_reverse(sb, &super_blocks, s_list)
+- sb->s_syncing = val;
+- spin_unlock(&sb_lock);
+-}
+-
+ /**
+ * sync_inodes - writes all inodes to disk
+ * @wait: wait for completion
+@@ -690,9 +678,6 @@ static void __sync_inodes(int wait)
+ spin_lock(&sb_lock);
+ restart:
+ list_for_each_entry(sb, &super_blocks, s_list) {
+- if (sb->s_syncing)
+- continue;
+- sb->s_syncing = 1;
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+ down_read(&sb->s_umount);
+@@ -710,13 +695,10 @@ restart:
+
+ void sync_inodes(int wait)
+ {
+- set_sb_syncing(0);
+ __sync_inodes(0);
+
+- if (wait) {
+- set_sb_syncing(0);
++ if (wait)
+ __sync_inodes(1);
+- }
+ }
+
+ /**
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -1124,7 +1124,6 @@ struct super_block {
+ struct rw_semaphore s_umount;
+ struct mutex s_lock;
+ int s_count;
+- int s_syncing;
+ int s_need_sync_fs;
+ atomic_t s_active;
+ #ifdef CONFIG_SECURITY
--- /dev/null
+From 48b47c561e41525061b5bc0cfd67d6367fd11dc4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:40:22 -0800
+Subject: mm: direct IO starvation improvement
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 48b47c561e41525061b5bc0cfd67d6367fd11dc4 upstream.
+
+Direct IO can invalidate and sync a lot of pagecache pages in the mapping.
+A 4K direct IO will actually try to sync and/or invalidate the pagecache
+of the entire file, for example (which might be many GB or TB large).
+
+Improve this by doing range syncs. Also, memory no longer has to be
+unmapped to catch the dirty bits for syncing, as dirty bits would remain
+coherent due to dirty mmap accounting.
+
+This fixes the immediate DM deadlocks when doing direct IO reads to a
+block device with a mounted filesystem, if only by papering over the
+problem somewhat rather than addressing the fsync starvation cases.
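+
+The shape of the read-side change, schematically (condensed from the
+generic_file_aio_read() hunk below; error handling omitted):
+
+	/* pos, iov, nr_segs: as passed to generic_file_aio_read() */
+	size_t len = iov_length(iov, nr_segs);
+
+	/* before: writes back and waits on the entire mapping */
+	retval = filemap_write_and_wait(mapping);
+
+	/* after: only the byte range this direct IO touches */
+	retval = filemap_write_and_wait_range(mapping, pos, pos + len - 1);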
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/filemap.c | 16 +++++-----------
+ 1 file changed, 5 insertions(+), 11 deletions(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -1317,7 +1317,8 @@ generic_file_aio_read(struct kiocb *iocb
+ goto out; /* skip atime */
+ size = i_size_read(inode);
+ if (pos < size) {
+- retval = filemap_write_and_wait(mapping);
++ retval = filemap_write_and_wait_range(mapping, pos,
++ pos + iov_length(iov, nr_segs) - 1);
+ if (!retval) {
+ retval = mapping->a_ops->direct_IO(READ, iocb,
+ iov, pos, nr_segs);
+@@ -2067,18 +2068,10 @@ generic_file_direct_write(struct kiocb *
+ if (count != ocount)
+ *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
+
+- /*
+- * Unmap all mmappings of the file up-front.
+- *
+- * This will cause any pte dirty bits to be propagated into the
+- * pageframes for the subsequent filemap_write_and_wait().
+- */
+ write_len = iov_length(iov, *nr_segs);
+ end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
+- if (mapping_mapped(mapping))
+- unmap_mapping_range(mapping, pos, write_len, 0);
+
+- written = filemap_write_and_wait(mapping);
++ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
+ if (written)
+ goto out;
+
+@@ -2298,7 +2291,8 @@ generic_file_buffered_write(struct kiocb
+ * the file data here, to try to honour O_DIRECT expectations.
+ */
+ if (unlikely(file->f_flags & O_DIRECT) && written)
+- status = filemap_write_and_wait(mapping);
++ status = filemap_write_and_wait_range(mapping,
++ pos, pos + written - 1);
+
+ return written ? written : status;
+ }
--- /dev/null
+From ee53a891f47444c53318b98dac947ede963db400 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:12 -0800
+Subject: mm: do_sync_mapping_range integrity fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit ee53a891f47444c53318b98dac947ede963db400 upstream.
+
+Chris Mason noticed do_sync_mapping_range didn't actually ask for data
+integrity writeout. Unfortunately, it is advertised as being usable for
+data integrity operations.
+
+This is a data integrity bug.
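+
+For context, a hypothetical caller relying on the advertised semantics
+(sync_file_range(2) is implemented on top of do_sync_mapping_range):
+
+	/* write out a 1MiB range of fd and wait for it to hit disk */
+	ret = sync_file_range(fd, offset, 1 << 20,
+			      SYNC_FILE_RANGE_WAIT_BEFORE |
+			      SYNC_FILE_RANGE_WRITE |
+			      SYNC_FILE_RANGE_WAIT_AFTER);
+
+With WB_SYNC_NONE, the WRITE step could skip pages (pages already under
+writeback, for instance), so the subsequent wait was no data integrity
+guarantee at all.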
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ fs/sync.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/sync.c
++++ b/fs/sync.c
+@@ -287,7 +287,7 @@ int do_sync_mapping_range(struct address
+
+ if (flags & SYNC_FILE_RANGE_WRITE) {
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+- WB_SYNC_NONE);
++ WB_SYNC_ALL);
+ if (ret < 0)
+ goto out;
+ }
--- /dev/null
+From 5a3d5c9813db56a75934eb1015367fda23a8b0b4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:09 -0800
+Subject: mm: write_cache_pages cleanups
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 5a3d5c9813db56a75934eb1015367fda23a8b0b4 upstream.
+
+Get rid of some complex expressions from flow control statements, add a
+comment, remove some duplicate code.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 34 ++++++++++++++++++++++------------
+ 1 file changed, 22 insertions(+), 12 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -899,11 +899,14 @@ int write_cache_pages(struct address_spa
+ }
+ retry:
+ done_index = index;
+- while (!done && (index <= end) &&
+- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+- PAGECACHE_TAG_DIRTY,
+- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+- unsigned i;
++ while (!done && (index <= end)) {
++ int i;
++
++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
++ PAGECACHE_TAG_DIRTY,
++ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
++ if (nr_pages == 0)
++ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+@@ -919,7 +922,16 @@ retry:
+ */
+ lock_page(page);
+
++ /*
++ * Page truncated or invalidated. We can freely skip it
++ * then, even for data integrity operations: the page
++ * has disappeared concurrently, so there could be no
++		 * real expectation of this data integrity operation
++ * even if there is now a new, dirty page at the same
++ * pagecache address.
++ */
+ if (unlikely(page->mapping != mapping)) {
++continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+@@ -930,18 +942,15 @@ retry:
+ * end == -1 in that case.
+ */
+ done = 1;
+- unlock_page(page);
+- continue;
++ goto continue_unlock;
+ }
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+
+ if (PageWriteback(page) ||
+- !clear_page_dirty_for_io(page)) {
+- unlock_page(page);
+- continue;
+- }
++ !clear_page_dirty_for_io(page))
++ goto continue_unlock;
+
+ ret = (*writepage)(page, wbc, data);
+ if (unlikely(ret)) {
+@@ -964,7 +973,8 @@ retry:
+ }
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+- if (--wbc->nr_to_write <= 0)
++ wbc->nr_to_write--;
++ if (wbc->nr_to_write <= 0)
+ done = 1;
+ }
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
--- /dev/null
+From bd19e012f6fd3b7309689165ea865cbb7bb88c1e Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:06 -0800
+Subject: mm: write_cache_pages early loop termination
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit bd19e012f6fd3b7309689165ea865cbb7bb88c1e upstream.
+
+We'd like to break out of the loop early in many situations; however, the
+existing code has been setting mapping->writeback_index past the final
+page in the pagevec lookup for cyclic writeback. This is a problem if we
+don't process all pages up to the final page.
+
+Currently the code mostly keeps writeback_index reasonable and hacks
+around this by not breaking out of the loop or writing pages outside the
+range in these cases. Keep track of a real "done index" that enables us
+to terminate the loop in a much more flexible manner.
+
+Needed by the subsequent patch to preserve writepage errors, and then
+further patches to break out of the loop early for other reasons. However
+there are no functional changes with this patch alone.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -871,6 +871,7 @@ int write_cache_pages(struct address_spa
+ pgoff_t uninitialized_var(writeback_index);
+ pgoff_t index;
+ pgoff_t end; /* Inclusive */
++ pgoff_t done_index;
+ int cycled;
+ int range_whole = 0;
+ long nr_to_write = wbc->nr_to_write;
+@@ -897,6 +898,7 @@ int write_cache_pages(struct address_spa
+ cycled = 1; /* ignore range_cyclic tests */
+ }
+ retry:
++ done_index = index;
+ while (!done && (index <= end) &&
+ (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+@@ -906,6 +908,8 @@ retry:
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
++ done_index = page->index + 1;
++
+ /*
+ * At this point we hold neither mapping->tree_lock nor
+ * lock on the page itself: the page may be truncated or
+@@ -968,7 +972,7 @@ retry:
+ }
+ if (!wbc->no_nrwrite_index_update) {
+ if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
+- mapping->writeback_index = index;
++ mapping->writeback_index = done_index;
+ wbc->nr_to_write = nr_to_write;
+ }
+
--- /dev/null
+From 05fe478dd04e02fa230c305ab9b5616669821dd3 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:08 -0800
+Subject: mm: write_cache_pages integrity fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 05fe478dd04e02fa230c305ab9b5616669821dd3 upstream.
+
+In write_cache_pages, nr_to_write is heeded even for data-integrity syncs,
+so the function will return success after writing out nr_to_write pages,
+even if that was not sufficient to guarantee data integrity.
+
+The callers tend to set it to values that could break data integrity
+semantics easily in practice.  For example, nr_to_write can be set to
+mapping->nrpages * 2; if a file has a single dirty page and fsync is
+then called, subsequent pages might be concurrently added and dirtied,
+and write_cache_pages might write out two of these newly dirtied pages
+while never writing out the old page that should have been written out.
+
+Fix this by ignoring nr_to_write if it is a data integrity sync.
+
+This is a data integrity bug.
+
+The reason this has been done in the past is to avoid stalling sync
+operations behind page dirtiers.
+
+ "If a file has one dirty page at offset 1000000000000000 then someone
+ does an fsync() and someone else gets in first and starts madly writing
+ pages at offset 0, we want to write that page at 1000000000000000.
+ Somehow."
+
+What we do today is return success after an arbitrary amount of pages are
+written, whether or not we have provided the data-integrity semantics that
+the caller has asked for. Even this doesn't actually fix all stall cases
+completely: in the above situation, if the file has a huge number of pages
+in pagecache (but not dirty), then mapping->nrpages is going to be huge,
+even if pages are being dirtied.
+
+This change does indeed make the possibility of long stalls larger, and
+that's not a good thing, but lying about data integrity is even worse. We
+have to either perform the sync, or return -ELINUXISLAME so at least the
+caller knows what has happened.
+
+There are subsequent competing approaches in the works to solve the stall
+problems properly, without compromising data integrity.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/filemap.c | 2 +-
+ mm/page-writeback.c | 6 ++++--
+ 2 files changed, 5 insertions(+), 3 deletions(-)
+
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct ad
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+- .nr_to_write = mapping->nrpages * 2,
++ .nr_to_write = LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -963,8 +963,10 @@ retry:
+ }
+ }
+
+- if (--nr_to_write <= 0)
+- done = 1;
++ if (wbc->sync_mode == WB_SYNC_NONE) {
++ if (--wbc->nr_to_write <= 0)
++ done = 1;
++ }
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
--- /dev/null
+From 82fd1a9a8ced9607312b54859572bcc6211e8919 Mon Sep 17 00:00:00 2001
+From: Andrew Morton <akpm@linux-foundation.org>
+Date: Tue, 6 Jan 2009 14:39:11 -0800
+Subject: mm: write_cache_pages more terminate quickly
+
+From: Andrew Morton <akpm@linux-foundation.org>
+
+commit 82fd1a9a8ced9607312b54859572bcc6211e8919 upstream.
+
+Now that we have the early-termination logic in place, it makes sense to
+bail out early in all other cases where done is set to 1.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -983,12 +983,15 @@ continue_unlock:
+
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ wbc->nr_to_write--;
+- if (wbc->nr_to_write <= 0)
++ if (wbc->nr_to_write <= 0) {
+ done = 1;
++ break;
++ }
+ }
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
+ done = 1;
++ break;
+ }
+ }
+ pagevec_release(&pvec);
--- /dev/null
+From 515f4a037fb9ab736f8bad733fcd2ffd350cf265 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:10 -0800
+Subject: mm: write_cache_pages optimise page cleaning
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 515f4a037fb9ab736f8bad733fcd2ffd350cf265 upstream.
+
+In write_cache_pages, if we get stuck behind another process that is
+cleaning pages, we will be forced to wait for them to finish, then perform
+our own writeout (if it was redirtied during the long wait), then wait for
+that.
+
+If a page under writeout is still clean, we can skip waiting for it (if
+we're part of a data integrity sync, we'll be waiting for all writeout
+pages afterwards, so we'll still be waiting for the other guy's write
+that's cleaned the page).
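+
+The resulting per-page decision, schematically (a sketch; the code below
+is authoritative):
+
+	page state                  WB_SYNC_ALL          WB_SYNC_NONE
+	----------                  -----------          ------------
+	not dirty                   skip (someone        skip
+	                            wrote it for us)
+	dirty, under writeback      wait, then write     skip (a write is
+	                                                 already in flight)
+	dirty, not under writeback  write                write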
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 17 +++++++++++++----
+ 1 file changed, 13 insertions(+), 4 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -945,11 +945,20 @@ continue_unlock:
+ goto continue_unlock;
+ }
+
+- if (wbc->sync_mode != WB_SYNC_NONE)
+- wait_on_page_writeback(page);
++ if (!PageDirty(page)) {
++ /* someone wrote it for us */
++ goto continue_unlock;
++ }
++
++ if (PageWriteback(page)) {
++ if (wbc->sync_mode != WB_SYNC_NONE)
++ wait_on_page_writeback(page);
++ else
++ goto continue_unlock;
++ }
+
+- if (PageWriteback(page) ||
+- !clear_page_dirty_for_io(page))
++ BUG_ON(PageWriteback(page));
++ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ ret = (*writepage)(page, wbc, data);
--- /dev/null
+From d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:11 -0800
+Subject: mm: write_cache_pages terminate quickly
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 upstream.
+
+Terminate the write_cache_pages loop upon encountering the first page past
+end, without locking the page. Pages cannot have their index change when
+we have a reference on them (truncate, eg truncate_inode_pages_range
+performs the same check without the page lock).
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 32 ++++++++++++++++----------------
+ 1 file changed, 16 insertions(+), 16 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -911,15 +911,24 @@ retry:
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+- done_index = page->index + 1;
+-
+ /*
+- * At this point we hold neither mapping->tree_lock nor
+- * lock on the page itself: the page may be truncated or
+- * invalidated (changing page->mapping to NULL), or even
+- * swizzled back from swapper_space to tmpfs file
+- * mapping
++ * At this point, the page may be truncated or
++ * invalidated (changing page->mapping to NULL), or
++ * even swizzled back from swapper_space to tmpfs file
++ * mapping. However, page->index will not change
++ * because we have a reference on the page.
+ */
++ if (page->index > end) {
++ /*
++ * can't be range_cyclic (1st pass) because
++ * end == -1 in that case.
++ */
++ done = 1;
++ break;
++ }
++
++ done_index = page->index + 1;
++
+ lock_page(page);
+
+ /*
+@@ -936,15 +945,6 @@ continue_unlock:
+ continue;
+ }
+
+- if (page->index > end) {
+- /*
+- * can't be range_cyclic (1st pass) because
+- * end == -1 in that case.
+- */
+- done = 1;
+- goto continue_unlock;
+- }
+-
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
--- /dev/null
+From 00266770b8b3a6a77f896ca501a0613739086832 Mon Sep 17 00:00:00 2001
+From: Nick Piggin <npiggin@suse.de>
+Date: Tue, 6 Jan 2009 14:39:06 -0800
+Subject: mm: write_cache_pages writepage error fix
+
+From: Nick Piggin <npiggin@suse.de>
+
+commit 00266770b8b3a6a77f896ca501a0613739086832 upstream.
+
+In write_cache_pages, if ret signals a real error but we still have some
+pages left in the pagevec, done would be set to 1, the remaining pages
+would continue to be processed, and ret would be overwritten in the
+process.
+
+It could easily be overwritten with success, and thus success would be
+returned even if there is an error.  Thus the caller would be told all
+writes succeeded, whereas in reality some did not.
+
+Fix this by bailing immediately if there is an error, and retaining the
+first error code.
+
+This is a data integrity bug.
+
+Signed-off-by: Nick Piggin <npiggin@suse.de>
+Cc: Chris Mason <chris.mason@oracle.com>
+Cc: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
+
+---
+ mm/page-writeback.c | 24 +++++++++++++++++++-----
+ 1 file changed, 19 insertions(+), 5 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -944,12 +944,26 @@ retry:
+ }
+
+ ret = (*writepage)(page, wbc, data);
++ if (unlikely(ret)) {
++ if (ret == AOP_WRITEPAGE_ACTIVATE) {
++ unlock_page(page);
++ ret = 0;
++ } else {
++ /*
++ * done_index is set past this page,
++ * so media errors will not choke
++ * background writeout for the entire
++ * file. This has consequences for
++ * range_cyclic semantics (ie. it may
++ * not be suitable for data integrity
++ * writeout).
++ */
++ done = 1;
++ break;
++ }
++ }
+
+- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
+- unlock_page(page);
+- ret = 0;
+- }
+- if (ret || (--nr_to_write <= 0))
++ if (--nr_to_write <= 0)
+ done = 1;
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ wbc->encountered_congestion = 1;
powerpc-is_hugepage_only_range-must-account-for-both-4kb-and-64kb-slices.patch
hwmon-fix-config_dmi-n-fallback-to-probe.patch
mm-write_cache_pages-cyclic-fix.patch
+mm-write_cache_pages-early-loop-termination.patch
+mm-write_cache_pages-writepage-error-fix.patch
+mm-write_cache_pages-integrity-fix.patch
+mm-write_cache_pages-cleanups.patch
+mm-write_cache_pages-optimise-page-cleaning.patch
+mm-write_cache_pages-terminate-quickly.patch
+mm-write_cache_pages-more-terminate-quickly.patch
+mm-do_sync_mapping_range-integrity-fix.patch
+mm-direct-io-starvation-improvement.patch
+fs-remove-wb_sync_hold.patch
+fs-sync_sb_inodes-fix.patch
+fs-sys_sync-fix.patch