]>
Commit | Line | Data |
---|---|---|
5789e898 GKH |
1 | From 54b9dd14d09f24927285359a227aa363ce46089e Mon Sep 17 00:00:00 2001 |
2 | From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> | |
3 | Date: Thu, 23 Jan 2014 15:53:14 -0800 | |
4 | Subject: mm/memory-failure.c: shift page lock from head page to tail page after thp split | |
5 | ||
6 | From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> | |
7 | ||
8 | commit 54b9dd14d09f24927285359a227aa363ce46089e upstream. | |
9 | ||
10 | After thp split in hwpoison_user_mappings(), we hold the page lock on the | |
11 | raw error page only around try_to_unmap(), hence we are in danger of a race | |
12 | condition. | |
13 | ||
14 | I found in the RHEL7 MCE-relay testing that we have "bad page" error | |
15 | when a memory error happens on a thp tail page used by qemu-kvm: | |
16 | ||
17 | Triggering MCE exception on CPU 10 | |
18 | mce: [Hardware Error]: Machine check events logged | |
19 | MCE exception done on CPU 10 | |
20 | MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption | |
21 | MCE 0x38c535: dirty LRU page recovery: Recovered | |
22 | qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000] | |
23 | BUG: Bad page state in process qemu-kvm pfn:38c400 | |
24 | page:ffffea000e310000 count:0 mapcount:0 mapping: (null) index:0x7ffae3c00 | |
25 | page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked) | |
26 | Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ... | |
27 | CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G M -------------- 3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1 | |
28 | Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011 | |
29 | Call Trace: | |
30 | dump_stack+0x19/0x1b | |
31 | bad_page.part.59+0xcf/0xe8 | |
32 | free_pages_prepare+0x148/0x160 | |
33 | free_hot_cold_page+0x31/0x140 | |
34 | free_hot_cold_page_list+0x46/0xa0 | |
35 | release_pages+0x1c1/0x200 | |
36 | free_pages_and_swap_cache+0xad/0xd0 | |
37 | tlb_flush_mmu.part.46+0x4c/0x90 | |
38 | tlb_finish_mmu+0x55/0x60 | |
39 | exit_mmap+0xcb/0x170 | |
40 | mmput+0x67/0xf0 | |
41 | vhost_dev_cleanup+0x231/0x260 [vhost_net] | |
42 | vhost_net_release+0x3f/0x90 [vhost_net] | |
43 | __fput+0xe9/0x270 | |
44 | ____fput+0xe/0x10 | |
45 | task_work_run+0xc4/0xe0 | |
46 | do_exit+0x2bb/0xa40 | |
47 | do_group_exit+0x3f/0xa0 | |
48 | get_signal_to_deliver+0x1d0/0x6e0 | |
49 | do_signal+0x48/0x5e0 | |
50 | do_notify_resume+0x71/0xc0 | |
51 | retint_signal+0x48/0x8c | |
52 | ||
53 | The reason for this bug is that a page fault happens before unlocking the | |
54 | head page at the end of memory_failure(). This strange page fault is | |
55 | trying to access address 0x20 and I'm not sure why qemu-kvm does | |
56 | this, but anyway as a result the SIGSEGV makes qemu-kvm exit and on the | |
57 | way we catch the bad page bug/warning because we try to free a locked | |
58 | page (which was the former head page). | |
59 | ||
60 | To fix this, this patch shifts the page lock from the head page to the | |
61 | tail page just after the thp split. SIGSEGV still happens, but it affects | |
62 | only the error-affected VMs, not the whole system. | |
63 | ||
64 | Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> | |
65 | Cc: Andi Kleen <andi@firstfloor.org> | |
66 | Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com> | |
67 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
68 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
69 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
70 | ||
71 | --- | |
72 | mm/memory-failure.c | 21 +++++++++++---------- | |
73 | 1 file changed, 11 insertions(+), 10 deletions(-) | |
74 | ||
75 | --- a/mm/memory-failure.c | |
76 | +++ b/mm/memory-failure.c | |
77 | @@ -854,14 +854,14 @@ static int page_action(struct page_state | |
78 | * the pages and send SIGBUS to the processes if the data was dirty. | |
79 | */ | |
80 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |
81 | - int trapno, int flags) | |
82 | + int trapno, int flags, struct page **hpagep) | |
83 | { | |
84 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | |
85 | struct address_space *mapping; | |
86 | LIST_HEAD(tokill); | |
87 | int ret; | |
88 | int kill = 1, forcekill; | |
89 | - struct page *hpage = compound_head(p); | |
90 | + struct page *hpage = *hpagep; | |
91 | struct page *ppage; | |
92 | ||
93 | if (PageReserved(p) || PageSlab(p)) | |
94 | @@ -940,11 +940,14 @@ static int hwpoison_user_mappings(struct | |
95 | * We pinned the head page for hwpoison handling, | |
96 | * now we split the thp and we are interested in | |
97 | * the hwpoisoned raw page, so move the refcount | |
98 | - * to it. | |
99 | + * to it. Similarly, page lock is shifted. | |
100 | */ | |
101 | if (hpage != p) { | |
102 | put_page(hpage); | |
103 | get_page(p); | |
104 | + lock_page(p); | |
105 | + unlock_page(hpage); | |
106 | + *hpagep = p; | |
107 | } | |
108 | /* THP is split, so ppage should be the real poisoned page. */ | |
109 | ppage = p; | |
110 | @@ -962,17 +965,11 @@ static int hwpoison_user_mappings(struct | |
111 | if (kill) | |
112 | collect_procs(ppage, &tokill); | |
113 | ||
114 | - if (hpage != ppage) | |
115 | - lock_page(ppage); | |
116 | - | |
117 | ret = try_to_unmap(ppage, ttu); | |
118 | if (ret != SWAP_SUCCESS) | |
119 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | |
120 | pfn, page_mapcount(ppage)); | |
121 | ||
122 | - if (hpage != ppage) | |
123 | - unlock_page(ppage); | |
124 | - | |
125 | /* | |
126 | * Now that the dirty bit has been propagated to the | |
127 | * struct page and all unmaps done we can decide if | |
128 | @@ -1189,8 +1186,12 @@ int memory_failure(unsigned long pfn, in | |
129 | /* | |
130 | * Now take care of user space mappings. | |
131 | * Abort on fail: __delete_from_page_cache() assumes unmapped page. | |
132 | + * | |
133 | + * When the raw error page is thp tail page, hpage points to the raw | |
134 | + * page after thp split. | |
135 | */ | |
136 | - if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { | |
137 | + if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | |
138 | + != SWAP_SUCCESS) { | |
139 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | |
140 | res = -EBUSY; | |
141 | goto out; |