Commit a1e97e95 (queued by GKH)
From fa41ba0d08de7c975c3e94d0067553f9b934221f Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 24 Aug 2017 12:55:08 +0200
Subject: s390/mm: avoid empty zero pages for KVM guests to avoid postcopy hangs

From: Christian Borntraeger <borntraeger@de.ibm.com>

commit fa41ba0d08de7c975c3e94d0067553f9b934221f upstream.

Right now there is a potential hang situation for postcopy migrations,
if the guest is enabling storage keys on the target system during the
postcopy process.

For storage key virtualization, we have to forbid the empty zero page as
the storage key is a property of the physical page frame.  As we enable
storage key handling lazily we then drop all mappings for empty zero
pages for lazy refaulting later on.

This does not work with the postcopy migration, which relies on the
empty zero page never triggering a fault again in the future. The reason
is that postcopy migration will simply read a page on the target system
if that page is a known zero page to fault in an empty zero page.  At
the same time postcopy remembers that this page was already transferred
- so any future userfault on that page will NOT be retransmitted again
to avoid races.

If now the guest enters the storage key mode while in postcopy, we will
break this assumption of postcopy.

The solution is to disable the empty zero page for KVM guests early on
and not during storage key enablement. With this change, the postcopy
migration process is guaranteed to start after no zero pages are left.

As guest pages are very likely not empty zero pages anyway the memory
overhead is also pretty small.

While at it this also adds proper page table locking to the zero page
removal.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Janosch Frank <frankja@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/s390/include/asm/pgtable.h |    2 +-
 arch/s390/mm/gmap.c             |   39 ++++++++++++++++++++++++++++++-------
 2 files changed, 33 insertions(+), 8 deletions(-)

--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -480,7 +480,7 @@ static inline int mm_alloc_pgste(struct
  * In the case that a guest uses storage keys
  * faults should no longer be backed by zero pages
  */
-#define mm_forbids_zeropage mm_use_skey
+#define mm_forbids_zeropage mm_has_pgste
 static inline int mm_use_skey(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2125,6 +2125,37 @@ static inline void thp_split_mm(struct m
 }
 
 /*
+ * Remove all empty zero pages from the mapping for lazy refaulting
+ * - This must be called after mm->context.has_pgste is set, to avoid
+ *   future creation of zero pages
+ * - This must be called after THP was enabled
+ */
+static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
+			   unsigned long end, struct mm_walk *walk)
+{
+	unsigned long addr;
+
+	for (addr = start; addr != end; addr += PAGE_SIZE) {
+		pte_t *ptep;
+		spinlock_t *ptl;
+
+		ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+		if (is_zero_pfn(pte_pfn(*ptep)))
+			ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
+		pte_unmap_unlock(ptep, ptl);
+	}
+	return 0;
+}
+
+static inline void zap_zero_pages(struct mm_struct *mm)
+{
+	struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
+
+	walk.mm = mm;
+	walk_page_range(0, TASK_SIZE, &walk);
+}
+
+/*
  * switch on pgstes for its userspace process (for kvm)
  */
 int s390_enable_sie(void)
@@ -2141,6 +2172,7 @@ int s390_enable_sie(void)
 	mm->context.has_pgste = 1;
 	/* split thp mappings and disable thp for future mappings */
 	thp_split_mm(mm);
+	zap_zero_pages(mm);
 	up_write(&mm->mmap_sem);
 	return 0;
 }
@@ -2153,13 +2185,6 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
 static int __s390_enable_skey(pte_t *pte, unsigned long addr,
 			      unsigned long next, struct mm_walk *walk)
 {
-	/*
-	 * Remove all zero page mappings,
-	 * after establishing a policy to forbid zero page mappings
-	 * following faults for that page will get fresh anonymous pages
-	 */
-	if (is_zero_pfn(pte_pfn(*pte)))
-		ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
 	/* Clear storage key */
 	ptep_zap_key(walk->mm, addr, pte);
 	return 0;