From: Nick Piggin <npiggin@suse.de>
Subject: mm: /dev/zero optimisation
References: bnc#430738
Patch-mainline: no (could be submitted)

The patch that removed ZERO_PAGE from the main VM paths also removed
the /dev/zero optimisation of mapping directly from ZERO_PAGE when
doing mmap(), along with the interesting read(2) "hack" where the MMU
was used to make zero-filling the target buffer zero-copy.
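
As an illustration (not part of the patch; the 64MB size and the mmap
flags are arbitrary), this is the userspace pattern the restored
read(2) fast path serves -- a large read into a private, writable
mapping, which the kernel can satisfy with page-table work on
ZERO_PAGE instead of clearing memory:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 64 << 20;		/* arbitrary test size */
		int fd = open("/dev/zero", O_RDONLY);
		/* private, writable, page-aligned target buffer */
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (fd < 0 || buf == MAP_FAILED)
			return 1;
		/* with the patch this is mostly zero-copy */
		if (read(fd, buf, len) != (ssize_t)len)
			return 1;
		printf("buf[0] == %d\n", buf[0]);	/* prints 0 */
		munmap(buf, len);
		close(fd);
		return 0;
	}
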
Some benchmarks have run into issues with this. Customers sometimes
use these benchmarks to qualify and test systems, so even if the
benchmarks themselves are "stupid", it saves some trouble to retain
this optimisation for them. Also, while I don't think it was
established that there is a "real" workload where this helps, it
cannot be proven that one does not exist.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
 drivers/char/mem.c |  105 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mm.h |    2 +
 mm/memory.c        |   90 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 195 insertions(+), 2 deletions(-)

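Editorial note (not part of the patch): read_zero() below relies on
the idiom "(PAGE_SIZE-1) & -(unsigned long) buf" to compute how many
bytes are left before the next page boundary. A standalone sketch of
that arithmetic, assuming a 4096-byte page:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL	/* assumed page size for the example */

	/* bytes from addr up to the next page boundary; 0 when aligned */
	static unsigned long bytes_to_boundary(unsigned long addr)
	{
		return (PAGE_SIZE - 1) & -addr;
	}

	int main(void)
	{
		printf("%lu\n", bytes_to_boundary(0x10f30));	/* 208 */
		printf("%lu\n", bytes_to_boundary(0x11000));	/* 0 */
		return 0;
	}
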
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -702,6 +702,100 @@ static ssize_t splice_write_null(struct
 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
 }
 
+#if 1 //ndef CONFIG_XEN
+/*
+ * For fun, we are using the MMU for this.
+ */
+static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct * vma;
+	unsigned long addr=(unsigned long)buf;
+
+	mm = current->mm;
+	/* Oops, this was forgotten before. -ben */
+	down_read(&mm->mmap_sem);
+
+	/* For private mappings, just map in zero pages. */
+	for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+		unsigned long count;
+
+		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
+			goto out_up;
+		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
+			break;
+		count = vma->vm_end - addr;
+		if (count > size)
+			count = size;
+
+		zap_page_range(vma, addr, count, NULL);
+		if (zeromap_page_range(vma, addr, count, PAGE_COPY))
+			break;
+
+		size -= count;
+		buf += count;
+		addr += count;
+		if (size == 0)
+			goto out_up;
+	}
+
+	up_read(&mm->mmap_sem);
+
+	/* The shared case is hard. Let's do the conventional zeroing. */
+	do {
+		unsigned long unwritten = clear_user(buf, PAGE_SIZE);
+		if (unwritten)
+			return size + unwritten - PAGE_SIZE;
+		cond_resched();
+		buf += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	} while (size);
+
+	return size;
+out_up:
+	up_read(&mm->mmap_sem);
+	return size;
+}
+
+static ssize_t read_zero(struct file * file, char __user * buf,
+			 size_t count, loff_t *ppos)
+{
+	unsigned long left, unwritten, written = 0;
+
+	if (!count)
+		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, count))
+		return -EFAULT;
+
+	left = count;
+
+	/* do we want to be clever? Arbitrary cut-off */
+	if (count >= PAGE_SIZE*4) {
+		unsigned long partial;
+
+		/* How much left of the page? */
+		partial = (PAGE_SIZE-1) & -(unsigned long) buf;
+		unwritten = clear_user(buf, partial);
+		written = partial - unwritten;
+		if (unwritten)
+			goto out;
+		left -= partial;
+		buf += partial;
+		unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
+		written += (left & PAGE_MASK) - unwritten;
+		if (unwritten)
+			goto out;
+		buf += left & PAGE_MASK;
+		left &= ~PAGE_MASK;
+	}
+	unwritten = clear_user(buf, left);
+	written += left - unwritten;
+out:
+	return written ? written : -EFAULT;
+}
+
+#else /* CONFIG_XEN */
 static ssize_t read_zero(struct file * file, char __user * buf,
 			size_t count, loff_t *ppos)
 {
@@ -730,15 +824,24 @@ static ssize_t read_zero(struct file * f
 	}
 	return written ? written : -EFAULT;
 }
+#endif /* CONFIG_XEN */
 
 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+	int err = 0;
+
 #ifndef CONFIG_MMU
 	return -ENOSYS;
 #endif
+
 	if (vma->vm_flags & VM_SHARED)
 		return shmem_zero_setup(vma);
-	return 0;
+#if 1 //ndef CONFIG_XEN
+	err = zeromap_page_range(vma, vma->vm_start,
+			vma->vm_end - vma->vm_start, vma->vm_page_prot);
+	BUG_ON(err == -EEXIST);
+#endif
+	return err;
 }
 
 static ssize_t write_full(struct file * file, const char __user * buf,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,6 +779,8 @@ void free_pgd_range(struct mmu_gather *t
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 		struct vm_area_struct *vma);
+int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
+		unsigned long size, pgprot_t prot);
 void unmap_mapping_range(struct address_space *mapping,
 		loff_t const holebegin, loff_t const holelen, int even_cows);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1268,6 +1268,95 @@ int get_user_pages(struct task_struct *t
 }
 EXPORT_SYMBOL(get_user_pages);
 
+static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+		unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pte_t *pte;
+	spinlock_t *ptl;
+	int err = 0;
+
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+	if (!pte)
+		return -EAGAIN;
+	arch_enter_lazy_mmu_mode();
+	do {
+		struct page *page = ZERO_PAGE(addr);
+		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+		if (unlikely(!pte_none(*pte))) {
+			err = -EEXIST;
+			pte++;
+			break;
+		}
+		page_cache_get(page);
+		page_add_file_rmap(page);
+		inc_mm_counter(mm, file_rss);
+		set_pte_at(mm, addr, pte, zero_pte);
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(pte - 1, ptl);
+	return err;
+}
+
+static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
+		unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err;
+
+	pmd = pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return -EAGAIN;
+	do {
+		next = pmd_addr_end(addr, end);
+		err = zeromap_pte_range(mm, pmd, addr, next, prot);
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+	return err;
+}
+
+static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+		unsigned long addr, unsigned long end, pgprot_t prot)
+{
+	pud_t *pud;
+	unsigned long next;
+	int err;
+
+	pud = pud_alloc(mm, pgd, addr);
+	if (!pud)
+		return -EAGAIN;
+	do {
+		next = pud_addr_end(addr, end);
+		err = zeromap_pmd_range(mm, pud, addr, next, prot);
+		if (err)
+			break;
+	} while (pud++, addr = next, addr != end);
+	return err;
+}
+
+int zeromap_page_range(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long size, pgprot_t prot)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long end = addr + size;
+	struct mm_struct *mm = vma->vm_mm;
+	int err;
+
+	BUG_ON(addr >= end);
+	pgd = pgd_offset(mm, addr);
+	flush_cache_range(vma, addr, end);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = zeromap_pud_range(mm, pgd, addr, next, prot);
+		if (err)
+			break;
+	} while (pgd++, addr = next, addr != end);
+	return err;
+}
+
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
 		spinlock_t **ptl)
 {
@@ -1878,7 +1967,6 @@ gotten:
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	VM_BUG_ON(old_page == ZERO_PAGE(0));
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 	if (!new_page)
 		goto oom;
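
Editorial note (not part of the patch): a quick way to exercise the
restored mmap_zero() path. The mapping size is arbitrary; a
MAP_PRIVATE mapping of /dev/zero should read back as zeroes (backed
by ZERO_PAGE) and go copy-on-write on the first store:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 1 << 20;	/* arbitrary: 1MB */
		int fd = open("/dev/zero", O_RDONLY);
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE, fd, 0);

		if (fd < 0 || p == MAP_FAILED)
			return 1;
		printf("p[0] == %d\n", p[0]);	/* 0: ZERO_PAGE mapped */
		p[0] = 42;			/* store triggers COW */
		printf("p[0] == %d\n", p[0]);	/* 42 */
		munmap(p, len);
		close(fd);
		return 0;
	}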