From: Greg Kroah-Hartman Date: Thu, 22 Jan 2009 23:31:44 +0000 (-0800) Subject: .27 patches X-Git-Tag: v2.6.27.13~4 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=05ec1fc4dac4ff4bf0ca0690b3f405a4e96bfd80;p=thirdparty%2Fkernel%2Fstable-queue.git .27 patches --- diff --git a/queue-2.6.27/fs-remove-wb_sync_hold.patch b/queue-2.6.27/fs-remove-wb_sync_hold.patch new file mode 100644 index 00000000000..11cb83b3c91 --- /dev/null +++ b/queue-2.6.27/fs-remove-wb_sync_hold.patch @@ -0,0 +1,91 @@ +From 4f5a99d64c17470a784a6c68064207d82e3e74a5 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:25 -0800 +Subject: fs: remove WB_SYNC_HOLD + +From: Nick Piggin + +commit 4f5a99d64c17470a784a6c68064207d82e3e74a5 upstream. + +Remove WB_SYNC_HOLD. The primary motivation is the design of my +anti-starvation code for fsync. It requires taking an inode lock over the +sync operation, so we could run into lock ordering problems with multiple +inodes. It is possible to take a single global lock to solve the ordering +problem, but then that would prevent a future nice implementation of "sync +multiple inodes" based on lock order via inode address. + +Seems like a backward step to remove this, but actually it is busted +anyway: we can't use the inode lists for data integrity wait: an inode can +be taken off the dirty lists but still be under writeback. In order to +satisfy data integrity semantics, we should wait for it to finish +writeback, but if we only search the dirty lists, we'll miss it. + +It would be possible to have a "writeback" list, for sys_sync, I suppose. +But why complicate things by optimising prematurely? For unmounting, we +could avoid the "livelock avoidance" code, which would be easier, but +again premature IMO. + +Fixing the existing data integrity problem will come next. + +Signed-off-by: Nick Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 12 ++---------- + include/linux/writeback.h | 1 - + 2 files changed, 2 insertions(+), 11 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *i + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. + * +- * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so +- * that it can be located for waiting on in __writeback_single_inode(). +- * + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, +@@ -499,10 +496,6 @@ void generic_sync_sb_inodes(struct super + __iget(inode); + pages_skipped = wbc->pages_skipped; + __writeback_single_inode(inode, wbc); +- if (wbc->sync_mode == WB_SYNC_HOLD) { +- inode->dirtied_when = jiffies; +- list_move(&inode->i_list, &sb->s_dirty); +- } + if (current_is_pdflush()) + writeback_release(bdi); + if (wbc->pages_skipped != pages_skipped) { +@@ -588,8 +581,7 @@ restart: + + /* + * writeback and wait upon the filesystem's dirty inodes. The caller will +- * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is +- * used to park the written inodes on sb->s_dirty for the wait pass. ++ * do this in two passes - one to write, and one to wait. + * + * A finite limit is set on the number of pages which will be written. + * To prevent infinite livelock of sys_sync().
+@@ -600,7 +592,7 @@ restart: + void sync_inodes_sb(struct super_block *sb, int wait) + { + struct writeback_control wbc = { +- .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, ++ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .range_start = 0, + .range_end = LLONG_MAX, + }; +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -30,7 +30,6 @@ static inline int task_is_pdflush(struct + enum writeback_sync_modes { + WB_SYNC_NONE, /* Don't wait on anything */ + WB_SYNC_ALL, /* Wait on every mapping */ +- WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */ + }; + + /* diff --git a/queue-2.6.27/fs-sync_sb_inodes-fix.patch b/queue-2.6.27/fs-sync_sb_inodes-fix.patch new file mode 100644 index 00000000000..f5a003ec40a --- /dev/null +++ b/queue-2.6.27/fs-sync_sb_inodes-fix.patch @@ -0,0 +1,106 @@ +From 38f21977663126fef53f5585e7f1653d8ebe55c4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:25 -0800 +Subject: fs: sync_sb_inodes fix + +From: Nick Piggin + +commit 38f21977663126fef53f5585e7f1653d8ebe55c4 upstream. + +Fix data integrity semantics required by sys_sync, by iterating over all +inodes and waiting for any writeback pages after the initial writeout. +Comments explain the exact problem. + +Signed-off-by: Nick Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++------- + 1 file changed, 53 insertions(+), 7 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -440,6 +440,7 @@ void generic_sync_sb_inodes(struct super + struct writeback_control *wbc) + { + const unsigned long start = jiffies; /* livelock avoidance */ ++ int sync = wbc->sync_mode == WB_SYNC_ALL; + + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&sb->s_io)) +@@ -516,7 +517,49 @@ void generic_sync_sb_inodes(struct super + if (!list_empty(&sb->s_more_io)) + wbc->more_io = 1; + } +- spin_unlock(&inode_lock); ++ ++ if (sync) { ++ struct inode *inode, *old_inode = NULL; ++ ++ /* ++ * Data integrity sync. Must wait for all pages under writeback, ++ * because there may have been pages dirtied before our sync ++ * call, but which had writeout started before we write it out. ++ * In which case, the inode may not be on the dirty list, but ++ * we still have to wait for that writeout. ++ */ ++ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { ++ struct address_space *mapping; ++ ++ if (inode->i_state & (I_FREEING|I_WILL_FREE)) ++ continue; ++ mapping = inode->i_mapping; ++ if (mapping->nrpages == 0) ++ continue; ++ __iget(inode); ++ spin_unlock(&inode_lock); ++ /* ++ * We hold a reference to 'inode' so it couldn't have ++ * been removed from s_inodes list while we dropped the ++ * inode_lock. We cannot iput the inode now as we can ++ * be holding the last reference and we cannot iput it ++ * under inode_lock. So we keep the reference and iput ++ * it later. 
++ */ ++ iput(old_inode); ++ old_inode = inode; ++ ++ filemap_fdatawait(mapping); ++ ++ cond_resched(); ++ ++ spin_lock(&inode_lock); ++ } ++ spin_unlock(&inode_lock); ++ iput(old_inode); ++ } else ++ spin_unlock(&inode_lock); ++ + return; /* Leave any unwritten inodes on s_io */ + } + EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); +@@ -596,13 +639,16 @@ void sync_inodes_sb(struct super_block * + .range_start = 0, + .range_end = LLONG_MAX, + }; +- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); +- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + +- wbc.nr_to_write = nr_dirty + nr_unstable + +- (inodes_stat.nr_inodes - inodes_stat.nr_unused) + +- nr_dirty + nr_unstable; +- wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ ++ if (!wait) { ++ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); ++ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); ++ ++ wbc.nr_to_write = nr_dirty + nr_unstable + ++ (inodes_stat.nr_inodes - inodes_stat.nr_unused); ++ } else ++ wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ ++ + sync_sb_inodes(sb, &wbc); + } + diff --git a/queue-2.6.27/fs-sys_sync-fix.patch b/queue-2.6.27/fs-sys_sync-fix.patch new file mode 100644 index 00000000000..c285093a8fc --- /dev/null +++ b/queue-2.6.27/fs-sys_sync-fix.patch @@ -0,0 +1,83 @@ +From 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:26 -0800 +Subject: fs: sys_sync fix + +From: Nick Piggin + +commit 856bf4d717feb8c55d4e2f817b71ebb70cfbc67b upstream. + +s_syncing livelock avoidance was breaking data integrity guarantee of +sys_sync, by allowing sys_sync to skip writing or waiting for superblocks +if there is a concurrent sys_sync happening. + +This livelock avoidance is much less important now that we don't have the +get_super_to_sync() call after every sb that we sync. This was replaced +by __put_super_and_need_restart. + +Signed-off-by: Nick Piggin +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 20 +------------------- + include/linux/fs.h | 1 - + 2 files changed, 1 insertion(+), 20 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -652,18 +652,6 @@ void sync_inodes_sb(struct super_block * + sync_sb_inodes(sb, &wbc); + } + +-/* +- * Rather lame livelock avoidance. 
+- */ +-static void set_sb_syncing(int val) +-{ +- struct super_block *sb; +- spin_lock(&sb_lock); +- list_for_each_entry_reverse(sb, &super_blocks, s_list) +- sb->s_syncing = val; +- spin_unlock(&sb_lock); +-} +- + /** + * sync_inodes - writes all inodes to disk + * @wait: wait for completion +@@ -690,9 +678,6 @@ static void __sync_inodes(int wait) + spin_lock(&sb_lock); + restart: + list_for_each_entry(sb, &super_blocks, s_list) { +- if (sb->s_syncing) +- continue; +- sb->s_syncing = 1; + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); +@@ -710,13 +695,10 @@ restart: + + void sync_inodes(int wait) + { +- set_sb_syncing(0); + __sync_inodes(0); + +- if (wait) { +- set_sb_syncing(0); ++ if (wait) + __sync_inodes(1); +- } + } + + /** +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1080,7 +1080,6 @@ struct super_block { + struct rw_semaphore s_umount; + struct mutex s_lock; + int s_count; +- int s_syncing; + int s_need_sync_fs; + atomic_t s_active; + #ifdef CONFIG_SECURITY diff --git a/queue-2.6.27/mm-direct-io-starvation-improvement.patch b/queue-2.6.27/mm-direct-io-starvation-improvement.patch new file mode 100644 index 00000000000..1872a649e7d --- /dev/null +++ b/queue-2.6.27/mm-direct-io-starvation-improvement.patch @@ -0,0 +1,73 @@ +From 48b47c561e41525061b5bc0cfd67d6367fd11dc4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:40:22 -0800 +Subject: mm: direct IO starvation improvement + +From: Nick Piggin + +commit 48b47c561e41525061b5bc0cfd67d6367fd11dc4 upstream. + +Direct IO can invalidate and sync a lot of pagecache pages in the mapping. + A 4K direct IO will actually try to sync and/or invalidate the pagecache +of the entire file, for example (which might be many GB or TB large). + +Improve this by doing range syncs. Also, memory no longer has to be +unmapped to catch the dirty bits for syncing, as dirty bits would remain +coherent due to dirty mmap accounting. + +This fixes the immediate DM deadlocks when doing direct IO reads to block +device with a mounted filesystem, if only by papering over the problem +somewhat rather than addressing the fsync starvation cases. + +Signed-off-by: Nick Piggin +Reviewed-by: Jeff Moyer +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 16 +++++----------- + 1 file changed, 5 insertions(+), 11 deletions(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -1304,7 +1304,8 @@ generic_file_aio_read(struct kiocb *iocb + goto out; /* skip atime */ + size = i_size_read(inode); + if (pos < size) { +- retval = filemap_write_and_wait(mapping); ++ retval = filemap_write_and_wait_range(mapping, pos, ++ pos + iov_length(iov, nr_segs) - 1); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); +@@ -2117,18 +2118,10 @@ generic_file_direct_write(struct kiocb * + if (count != ocount) + *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); + +- /* +- * Unmap all mmappings of the file up-front. +- * +- * This will cause any pte dirty bits to be propagated into the +- * pageframes for the subsequent filemap_write_and_wait(). 
+- */ + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; +- if (mapping_mapped(mapping)) +- unmap_mapping_range(mapping, pos, write_len, 0); + +- written = filemap_write_and_wait(mapping); ++ written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1); + if (written) + goto out; + +@@ -2519,7 +2512,8 @@ generic_file_buffered_write(struct kiocb + * the file data here, to try to honour O_DIRECT expectations. + */ + if (unlikely(file->f_flags & O_DIRECT) && written) +- status = filemap_write_and_wait(mapping); ++ status = filemap_write_and_wait_range(mapping, ++ pos, pos + written - 1); + + return written ? written : status; + } diff --git a/queue-2.6.27/mm-do_sync_mapping_range-integrity-fix.patch b/queue-2.6.27/mm-do_sync_mapping_range-integrity-fix.patch new file mode 100644 index 00000000000..8a36b0a64c8 --- /dev/null +++ b/queue-2.6.27/mm-do_sync_mapping_range-integrity-fix.patch @@ -0,0 +1,37 @@ +From ee53a891f47444c53318b98dac947ede963db400 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:12 -0800 +Subject: mm: do_sync_mapping_range integrity fix + +From: Nick Piggin + +commit ee53a891f47444c53318b98dac947ede963db400 upstream. + +Chris Mason notices do_sync_mapping_range didn't actually ask for data +integrity writeout. Unfortunately, it is advertised as being usable for +data integrity operations. + +This is a data integrity bug. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/sync.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/sync.c ++++ b/fs/sync.c +@@ -287,7 +287,7 @@ int do_sync_mapping_range(struct address + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, +- WB_SYNC_NONE); ++ WB_SYNC_ALL); + if (ret < 0) + goto out; + } diff --git a/queue-2.6.27/mm-write_cache_pages-cleanups.patch b/queue-2.6.27/mm-write_cache_pages-cleanups.patch new file mode 100644 index 00000000000..f41ed9589af --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-cleanups.patch @@ -0,0 +1,94 @@ +From 5a3d5c9813db56a75934eb1015367fda23a8b0b4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:09 -0800 +Subject: mm: write_cache_pages cleanups + +From: Nick Piggin + +commit 5a3d5c9813db56a75934eb1015367fda23a8b0b4 upstream. + +Get rid of some complex expressions from flow control statements, add a +comment, remove some duplicate code. 
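To make the restructuring concrete, here is a minimal, self-contained userspace C model of the same control flow; lookup_batch(), page_dirty(), lock_page() and unlock_page() are illustrative stand-ins for the kernel's pagevec and page-lock machinery, not real interfaces. It shows the two changes this patch describes: the batch lookup hoisted out of the compound while condition, and the duplicated unlock-then-continue exits folded into a single continue_unlock label.

#include <stdio.h>

#define BATCH 4

/* Stand-in for pagevec_lookup_tag(): collect up to BATCH "dirty" page
 * indices, advancing the cursor as the real lookup does. */
static int lookup_batch(int *batch, int *index, int end)
{
	int n = 0;

	while (n < BATCH && *index <= end)
		batch[n++] = (*index)++;
	return n;
}

static int page_dirty(int page)   { return page % 3 != 0; }
static void lock_page(int page)   { (void)page; }
static void unlock_page(int page) { (void)page; }

int main(void)
{
	int batch[BATCH];
	int index = 0, end = 10;

	while (index <= end) {
		int i, nr_pages = lookup_batch(batch, &index, end);

		if (nr_pages == 0)
			break;	/* replaces the compound while condition */

		for (i = 0; i < nr_pages; i++) {
			int page = batch[i];

			lock_page(page);
			if (!page_dirty(page))
				goto continue_unlock;

			printf("writing page %d\n", page);
			unlock_page(page);
			continue;
continue_unlock:
			unlock_page(page);	/* one shared skip path */
		}
	}
	return 0;
}

A goto to a shared exit label is the idiomatic kernel pattern when several checks need identical cleanup; it is what lets the later patches in this series add more skip conditions without duplicating the unlock again.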
+ +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 34 ++++++++++++++++++++++------------ + 1 file changed, 22 insertions(+), 12 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -902,11 +902,14 @@ int write_cache_pages(struct address_spa + } + retry: + done_index = index; +- while (!done && (index <= end) && +- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, +- PAGECACHE_TAG_DIRTY, +- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { +- unsigned i; ++ while (!done && (index <= end)) { ++ int i; ++ ++ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, ++ PAGECACHE_TAG_DIRTY, ++ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); ++ if (nr_pages == 0) ++ break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; +@@ -922,7 +925,16 @@ retry: + */ + lock_page(page); + ++ /* ++ * Page truncated or invalidated. We can freely skip it ++ * then, even for data integrity operations: the page ++ * has disappeared concurrently, so there could be no ++ * real expectation of this data integrity operation ++ * even if there is now a new, dirty page at the same ++ * pagecache address. ++ */ + if (unlikely(page->mapping != mapping)) { ++continue_unlock: + unlock_page(page); + continue; + } +@@ -933,18 +945,15 @@ retry: + * end == -1 in that case. + */ + done = 1; +- unlock_page(page); +- continue; ++ goto continue_unlock; + } + + if (wbc->sync_mode != WB_SYNC_NONE) + wait_on_page_writeback(page); + + if (PageWriteback(page) || +- !clear_page_dirty_for_io(page)) { +- unlock_page(page); +- continue; +- } ++ !clear_page_dirty_for_io(page)) ++ goto continue_unlock; + + ret = (*writepage)(page, wbc, data); + +@@ -968,7 +977,8 @@ retry: + } + + if (wbc->sync_mode == WB_SYNC_NONE) { +- if (--wbc->nr_to_write <= 0) ++ wbc->nr_to_write--; ++ if (wbc->nr_to_write <= 0) + done = 1; + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { diff --git a/queue-2.6.27/mm-write_cache_pages-early-loop-termination.patch b/queue-2.6.27/mm-write_cache_pages-early-loop-termination.patch new file mode 100644 index 00000000000..4be92787246 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-early-loop-termination.patch @@ -0,0 +1,70 @@ +From bd19e012f6fd3b7309689165ea865cbb7bb88c1e Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:06 -0800 +Subject: mm: write_cache_pages early loop termination + +From: Nick Piggin + +commit bd19e012f6fd3b7309689165ea865cbb7bb88c1e upstream. + +We'd like to break out of the loop early in many situations; however, the +existing code has been setting mapping->writeback_index past the final +page in the pagevec lookup for cyclic writeback. This is a problem if we +don't process all pages up to the final page. + +Currently the code mostly keeps writeback_index reasonable, and hacks +around this by not breaking out of the loop or writing pages outside the +range in these cases. Keep track of a real "done index" that enables us +to terminate the loop in a much more flexible manner. + +Needed by the subsequent patch to preserve writepage errors, and then +further patches to break out of the loop early for other reasons. However +there are no functional changes with this patch alone.
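As a concrete illustration of the "done index" idea, the sketch below is a simplified, self-contained userspace model, not kernel code; pages[], nr_to_write and done_index merely mimic the roles of the pagevec contents, wbc->nr_to_write and the new variable. It shows why the resume point must be the first unprocessed page rather than the lookup cursor: after an early break, the cursor has already advanced past pages that were looked up but never written.

#include <stdio.h>

int main(void)
{
	/* Indices of the dirty pages a tag lookup would return. */
	int pages[] = { 3, 4, 7, 9, 12 };
	int npages = sizeof(pages) / sizeof(pages[0]);
	int nr_to_write = 2;	/* plays the role of wbc->nr_to_write */
	int done_index = 0;
	int i;

	for (i = 0; i < npages; i++) {
		done_index = pages[i] + 1;	/* first index NOT handled */
		printf("writing page %d\n", pages[i]);
		if (--nr_to_write <= 0)
			break;			/* early termination */
	}

	/* What the kernel stores in mapping->writeback_index. */
	printf("resume next cycle at index %d\n", done_index);
	return 0;
}

Resuming at done_index (5 here) rediscovers pages 7, 9 and 12 on the next cycle; resuming at the lookup cursor (13 after a full batch) would silently skip them.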
+ +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -875,6 +875,7 @@ int write_cache_pages(struct address_spa + pgoff_t uninitialized_var(writeback_index); + pgoff_t index; + pgoff_t end; /* Inclusive */ ++ pgoff_t done_index; + int cycled; + int range_whole = 0; + +@@ -900,6 +901,7 @@ int write_cache_pages(struct address_spa + cycled = 1; /* ignore range_cyclic tests */ + } + retry: ++ done_index = index; + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, +@@ -909,6 +911,8 @@ retry: + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + ++ done_index = page->index + 1; ++ + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or +@@ -970,7 +974,7 @@ retry: + goto retry; + } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) +- mapping->writeback_index = index; ++ mapping->writeback_index = done_index; + + if (wbc->range_cont) + wbc->range_start = index << PAGE_CACHE_SHIFT; diff --git a/queue-2.6.27/mm-write_cache_pages-integrity-fix.patch b/queue-2.6.27/mm-write_cache_pages-integrity-fix.patch new file mode 100644 index 00000000000..4099f01981a --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-integrity-fix.patch @@ -0,0 +1,85 @@ +From 05fe478dd04e02fa230c305ab9b5616669821dd3 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:08 -0800 +Subject: mm: write_cache_pages integrity fix + +From: Nick Piggin + +commit 05fe478dd04e02fa230c305ab9b5616669821dd3 upstream. + +In write_cache_pages, nr_to_write is heeded even for data-integrity syncs, +so the function will return success after writing out nr_to_write pages, +even if that was not sufficient to guarantee data integrity. + +The callers tend to set it to values that could break data integrity +semantics easily in practice. For example, nr_to_write can be set to +mapping->nrpages * 2; however, if a file has a single dirty page, then +fsync is called, subsequent pages might be concurrently added and dirtied, +then write_cache_pages might write out two of these newly dirty pages, +while not writing out the old page that should have been written out. + +Fix this by ignoring nr_to_write if it is a data integrity sync. + +This is a data integrity bug. + +The reason this has been done in the past is to avoid stalling sync +operations behind page dirtiers. + + "If a file has one dirty page at offset 1000000000000000 then someone + does an fsync() and someone else gets in first and starts madly writing + pages at offset 0, we want to write that page at 1000000000000000. + Somehow." + +What we do today is return success after an arbitrary number of pages are +written, whether or not we have provided the data-integrity semantics that +the caller has asked for. Even this doesn't actually fix all stall cases +completely: in the above situation, if the file has a huge number of pages +in pagecache (but not dirty), then mapping->nrpages is going to be huge, +even if pages are being dirtied. + +This change does indeed make the possibility of long stalls larger, and +that's not a good thing, but lying about data integrity is even worse.
We +have to either perform the sync, or return -ELINUXISLAME so at least the +caller knows what has happened. + +There are subsequent competing approaches in the works to solve the stall +problems properly, without compromising data integrity. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/filemap.c | 2 +- + mm/page-writeback.c | 6 ++++-- + 2 files changed, 5 insertions(+), 3 deletions(-) + +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -209,7 +209,7 @@ int __filemap_fdatawrite_range(struct ad + int ret; + struct writeback_control wbc = { + .sync_mode = sync_mode, +- .nr_to_write = mapping->nrpages * 2, ++ .nr_to_write = LONG_MAX, + .range_start = start, + .range_end = end, + }; +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -967,8 +967,10 @@ retry: + } + } + +- if (--(wbc->nr_to_write) <= 0) +- done = 1; ++ if (wbc->sync_mode == WB_SYNC_NONE) { ++ if (--wbc->nr_to_write <= 0) ++ done = 1; ++ } + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; diff --git a/queue-2.6.27/mm-write_cache_pages-more-terminate-quickly.patch b/queue-2.6.27/mm-write_cache_pages-more-terminate-quickly.patch new file mode 100644 index 00000000000..da422cf4282 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-more-terminate-quickly.patch @@ -0,0 +1,42 @@ +From 82fd1a9a8ced9607312b54859572bcc6211e8919 Mon Sep 17 00:00:00 2001 +From: Andrew Morton +Date: Tue, 6 Jan 2009 14:39:11 -0800 +Subject: mm: write_cache_pages more terminate quickly + +From: Andrew Morton + +commit 82fd1a9a8ced9607312b54859572bcc6211e8919 upstream. + +Now that we have the early-termination logic in place, it makes sense to +bail out early in all other cases where done is set to 1. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -987,12 +987,15 @@ continue_unlock: + + if (wbc->sync_mode == WB_SYNC_NONE) { + wbc->nr_to_write--; +- if (wbc->nr_to_write <= 0) ++ if (wbc->nr_to_write <= 0) { + done = 1; ++ break; ++ } + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; ++ break; + } + } + pagevec_release(&pvec); diff --git a/queue-2.6.27/mm-write_cache_pages-optimise-page-cleaning.patch b/queue-2.6.27/mm-write_cache_pages-optimise-page-cleaning.patch new file mode 100644 index 00000000000..0a33de0f317 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-optimise-page-cleaning.patch @@ -0,0 +1,57 @@ +From 515f4a037fb9ab736f8bad733fcd2ffd350cf265 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:10 -0800 +Subject: mm: write_cache_pages optimise page cleaning + +From: Nick Piggin + +commit 515f4a037fb9ab736f8bad733fcd2ffd350cf265 upstream. + +In write_cache_pages, if we get stuck behind another process that is +cleaning pages, we will be forced to wait for them to finish, then perform +our own writeout (if it was redirtied during the long wait), then wait for +that. + +If a page under writeout is still clean, we can skip waiting for it (if +we're part of a data integrity sync, we'll be waiting for all writeout +pages afterwards, so we'll still be waiting for the other guy's write +that's cleaned the page). 
+ +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 17 +++++++++++++---- + 1 file changed, 13 insertions(+), 4 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -948,11 +948,20 @@ continue_unlock: + goto continue_unlock; + } + +- if (wbc->sync_mode != WB_SYNC_NONE) +- wait_on_page_writeback(page); ++ if (!PageDirty(page)) { ++ /* someone wrote it for us */ ++ goto continue_unlock; ++ } ++ ++ if (PageWriteback(page)) { ++ if (wbc->sync_mode != WB_SYNC_NONE) ++ wait_on_page_writeback(page); ++ else ++ goto continue_unlock; ++ } + +- if (PageWriteback(page) || +- !clear_page_dirty_for_io(page)) ++ BUG_ON(PageWriteback(page)); ++ if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = (*writepage)(page, wbc, data); diff --git a/queue-2.6.27/mm-write_cache_pages-terminate-quickly.patch b/queue-2.6.27/mm-write_cache_pages-terminate-quickly.patch new file mode 100644 index 00000000000..de9da6d14c4 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-terminate-quickly.patch @@ -0,0 +1,75 @@ +From d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:11 -0800 +Subject: mm: write_cache_pages terminate quickly + +From: Nick Piggin + +commit d5482cdf8a0aacb1e6468a97d5544f5829c8d8c4 upstream. + +Terminate the write_cache_pages loop upon encountering the first page past +end, without locking the page. Pages cannot have their index change when +we have a reference on them (truncate, eg truncate_inode_pages_range +performs the same check without the page lock). + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 32 ++++++++++++++++---------------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -914,15 +914,24 @@ retry: + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + +- done_index = page->index + 1; +- + /* +- * At this point we hold neither mapping->tree_lock nor +- * lock on the page itself: the page may be truncated or +- * invalidated (changing page->mapping to NULL), or even +- * swizzled back from swapper_space to tmpfs file +- * mapping ++ * At this point, the page may be truncated or ++ * invalidated (changing page->mapping to NULL), or ++ * even swizzled back from swapper_space to tmpfs file ++ * mapping. However, page->index will not change ++ * because we have a reference on the page. + */ ++ if (page->index > end) { ++ /* ++ * can't be range_cyclic (1st pass) because ++ * end == -1 in that case. ++ */ ++ done = 1; ++ break; ++ } ++ ++ done_index = page->index + 1; ++ + lock_page(page); + + /* +@@ -939,15 +948,6 @@ continue_unlock: + continue; + } + +- if (page->index > end) { +- /* +- * can't be range_cyclic (1st pass) because +- * end == -1 in that case. 
+- */ +- done = 1; +- goto continue_unlock; +- } +- + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; diff --git a/queue-2.6.27/mm-write_cache_pages-writepage-error-fix.patch b/queue-2.6.27/mm-write_cache_pages-writepage-error-fix.patch new file mode 100644 index 00000000000..c75a00584e8 --- /dev/null +++ b/queue-2.6.27/mm-write_cache_pages-writepage-error-fix.patch @@ -0,0 +1,67 @@ +From 00266770b8b3a6a77f896ca501a0613739086832 Mon Sep 17 00:00:00 2001 +From: Nick Piggin +Date: Tue, 6 Jan 2009 14:39:06 -0800 +Subject: mm: write_cache_pages writepage error fix + +From: Nick Piggin + +commit 00266770b8b3a6a77f896ca501a0613739086832 upstream. + +In write_cache_pages, if ret signals a real error, but we still have some +pages left in the pagevec, done would be set to 1, but the remaining pages +would continue to be processed and ret would be overwritten in the process. + +It could easily be overwritten with success, and thus success would be +returned even if there is an error. Thus the caller is told all writes +succeeded, whereas in reality some did not. + +Fix this by bailing immediately if there is an error, and retaining the +first error code. + +This is a data integrity bug. + +Signed-off-by: Nick Piggin +Cc: Chris Mason +Cc: Dave Chinner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 25 ++++++++++++++++++++----- + 1 file changed, 20 insertions(+), 5 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -948,11 +948,26 @@ retry: + + ret = (*writepage)(page, wbc, data); + +- if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { +- unlock_page(page); +- ret = 0; +- } +- if (ret || (--(wbc->nr_to_write) <= 0)) ++ if (unlikely(ret)) { ++ if (ret == AOP_WRITEPAGE_ACTIVATE) { ++ unlock_page(page); ++ ret = 0; ++ } else { ++ /* ++ * done_index is set past this page, ++ * so media errors will not choke ++ * background writeout for the entire ++ * file. This has consequences for ++ * range_cyclic semantics (ie. it may ++ * not be suitable for data integrity ++ * writeout). ++ */ ++ done = 1; ++ break; ++ } ++ } ++ ++ if (--(wbc->nr_to_write) <= 0) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; diff --git a/queue-2.6.27/series index 2a12e53ad52..d4523051de0 100644 --- a/queue-2.6.27/series +++ b/queue-2.6.27/series @@ -26,3 +26,15 @@ dell_rbu-use-scnprintf-instead-of-less-secure-sprintf.patch hwmon-fix-config_dmi-n-fallback-to-probe.patch powerpc-is_hugepage_only_range-must-account-for-both-4kb-and-64kb-slices.patch mm-write_cache_pages-cyclic-fix.patch +mm-write_cache_pages-early-loop-termination.patch +mm-write_cache_pages-writepage-error-fix.patch +mm-write_cache_pages-integrity-fix.patch +mm-write_cache_pages-cleanups.patch +mm-write_cache_pages-optimise-page-cleaning.patch +mm-write_cache_pages-terminate-quickly.patch +mm-write_cache_pages-more-terminate-quickly.patch +mm-do_sync_mapping_range-integrity-fix.patch +mm-direct-io-starvation-improvement.patch +fs-remove-wb_sync_hold.patch +fs-sync_sb_inodes-fix.patch +fs-sys_sync-fix.patch
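To close the series, here is a minimal userspace C model of the error-retention logic from mm-write_cache_pages-writepage-error-fix.patch above; writepage() is a stand-in that fails on one page, not a real address_space operation. It demonstrates the invariant the fix establishes: once a hard error is seen, the loop bails immediately and the first error code is what the caller receives, rather than being overwritten by later successful writes in the same batch.

#include <stdio.h>

/* Stand-in for a ->writepage call: page 2 hits a media error. */
static int writepage(int index)
{
	return index == 2 ? -5 /* -EIO */ : 0;
}

int main(void)
{
	int ret = 0, done = 0, index;

	for (index = 0; index < 6 && !done; index++) {
		int err = writepage(index);

		if (err) {
			ret = err;	/* retain the FIRST error... */
			done = 1;	/* ...and bail, so later successes
					 * cannot overwrite it */
		}
	}
	printf("sync result: %d\n", ret);	/* -5, not 0 */
	return 0;
}

Building and running this (for example, cc -o errdemo errdemo.c && ./errdemo, with errdemo a hypothetical file name) prints "sync result: -5", confirming the first error survives to the caller.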