Commit: 8c71e98e (stable backport, GKH)
1 | From ef5d437f71afdf4afdbab99213add99f4b1318fd Mon Sep 17 00:00:00 2001 |
2 | From: Jan Kara <jack@suse.cz> | |
3 | Date: Thu, 25 Oct 2012 13:37:31 -0700 | |
4 | Subject: mm: fix XFS oops due to dirty pages without buffers on s390 | |
5 | ||
6 | From: Jan Kara <jack@suse.cz> | |
7 | ||
8 | commit ef5d437f71afdf4afdbab99213add99f4b1318fd upstream. | |
9 | ||
10 | On s390 any write to a page (even from kernel itself) sets architecture | |
11 | specific page dirty bit. Thus when a page is written to via buffered | |
12 | write, HW dirty bit gets set and when we later map and unmap the page, | |
13 | page_remove_rmap() finds the dirty bit and calls set_page_dirty(). | |
14 | ||
15 | Dirtying of a page which shouldn't be dirty can cause all sorts of | |
16 | problems to filesystems. The bug we observed in practice is that | |
17 | buffers from the page get freed, so when the page gets later marked as | |
18 | dirty and writeback writes it, XFS crashes due to an assertion | |
19 | BUG_ON(!PagePrivate(page)) in page_buffers() called from | |
20 | xfs_count_page_state(). | |
21 | ||
22 | A similar problem can also happen when the zero_user_segment() call from | |
23 | xfs_vm_writepage() (or block_write_full_page() for that matter) sets the | |
24 | hardware dirty bit during writeback, later buffers get freed, and then the | |
25 | page gets unmapped. | |
26 | ||
27 | Fix the issue by ignoring s390 HW dirty bit for page cache pages of | |
28 | mappings with mapping_cap_account_dirty(). This is safe because for | |
29 | such mappings when a page gets marked as writeable in PTE it is also | |
30 | marked dirty in do_wp_page() or do_page_fault(). When the dirty bit is | |
31 | cleared by clear_page_dirty_for_io(), the page gets writeprotected in | |
32 | page_mkclean(). So pagecache page is writeable if and only if it is | |
33 | dirty. | |
34 | ||
35 | Thanks to Hugh Dickins for pointing out mapping has to have | |
36 | mapping_cap_account_dirty() for things to work and proposing a cleaned | |
37 | up variant of the patch. | |
38 | ||
39 | The patch has survived about two hours of running fsx-linux on tmpfs | |
40 | while heavily swapping and several days of running on our build machines | |
41 | where the original problem was triggered. | |
42 | ||
43 | Signed-off-by: Jan Kara <jack@suse.cz> | |
44 | Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> | |
45 | Cc: Mel Gorman <mgorman@suse.de> | |
46 | Cc: Hugh Dickins <hughd@google.com> | |
47 | Cc: Heiko Carstens <heiko.carstens@de.ibm.com> | |
48 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
49 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
50 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
51 | ||
52 | --- | |
53 | mm/rmap.c | 20 +++++++++++++++----- | |
54 | 1 file changed, 15 insertions(+), 5 deletions(-) | |
55 | ||
56 | --- a/mm/rmap.c | |
57 | +++ b/mm/rmap.c | |
58 | @@ -56,6 +56,7 @@ | |
59 | #include <linux/mmu_notifier.h> | |
60 | #include <linux/migrate.h> | |
61 | #include <linux/hugetlb.h> | |
62 | +#include <linux/backing-dev.h> | |
63 | ||
64 | #include <asm/tlbflush.h> | |
65 | ||
66 | @@ -977,11 +978,8 @@ int page_mkclean(struct page *page) | |
67 | ||
68 | if (page_mapped(page)) { | |
69 | struct address_space *mapping = page_mapping(page); | |
70 | - if (mapping) { | |
71 | + if (mapping) | |
72 | ret = page_mkclean_file(mapping, page); | |
73 | - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) | |
74 | - ret = 1; | |
75 | - } | |
76 | } | |
77 | ||
78 | return ret; | |
79 | @@ -1167,6 +1165,7 @@ void page_add_file_rmap(struct page *pag | |
80 | */ | |
81 | void page_remove_rmap(struct page *page) | |
82 | { | |
83 | + struct address_space *mapping = page_mapping(page); | |
84 | bool anon = PageAnon(page); | |
85 | bool locked; | |
86 | unsigned long flags; | |
87 | @@ -1189,8 +1188,19 @@ void page_remove_rmap(struct page *page) | |
88 | * this if the page is anon, so about to be freed; but perhaps | |
89 | * not if it's in swapcache - there might be another pte slot | |
90 | * containing the swap entry, but page not yet written to swap. | |
91 | + * | |
92 | + * And we can skip it on file pages, so long as the filesystem | |
93 | + * participates in dirty tracking; but need to catch shm and tmpfs | |
94 | + * and ramfs pages which have been modified since creation by read | |
95 | + * fault. | |
96 | + * | |
97 | + * Note that mapping must be decided above, before decrementing | |
98 | + * mapcount (which luckily provides a barrier): once page is unmapped, | |
99 | + * it could be truncated and page->mapping reset to NULL at any moment. | |
100 | + * Note also that we are relying on page_mapping(page) to set mapping | |
101 | + * to &swapper_space when PageSwapCache(page). | |
102 | */ | |
103 | - if ((!anon || PageSwapCache(page)) && | |
104 | + if (mapping && !mapping_cap_account_dirty(mapping) && | |
105 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | |
106 | set_page_dirty(page); | |
107 | /* |