From: Nick Piggin <npiggin@suse.de>
Subject: mm: /dev/zero optimisation
References: bnc#430738
Patch-mainline: no (could be submitted)

The patch that removed ZERO_PAGE from the main VM paths also removed the
/dev/zero optimisation of mapping directly from ZERO_PAGE when doing
mmap(), as well as the interesting read(2) "hack" where the MMU was used
to make zero-filling the target buffer zero-copy.

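For reference, the kind of userspace pattern this optimisation targets
looks roughly like the sketch below. This is an illustration only, not
part of the patch; the 64MB request size and the /dev/zero usage are
arbitrary assumptions.

/*
 * Illustrative only: the two access patterns that benefit.
 */
#include <fcntl.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 64 * 1024 * 1024;  /* large, mostly page-aligned request */
        int fd = open("/dev/zero", O_RDWR);
        char *buf = malloc(len);

        if (fd < 0 || !buf)
                return 1;

        /* mmap() path: a MAP_PRIVATE mapping of /dev/zero can be backed
         * by ZERO_PAGE until a page is actually written to. */
        char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        /* read(2) path: the page-aligned bulk of the destination can be
         * remapped to ZERO_PAGE instead of being cleared by hand. */
        if (read(fd, buf, len) != (ssize_t)len)
                return 1;

        munmap(map, len);
        free(buf);
        close(fd);
        return 0;
}
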
Some benchmarks have run into issues with this. Customers sometimes
use these benchmarks to qualify and test systems, so even if the
benchmarks themselves are "stupid", it saves some trouble to retain
this optimisation for them. Also, while I don't think it was established
that there is a "real" workload where this helps, it can't be proven
that one does not exist.

Signed-off-by: Nick Piggin <npiggin@suse.de>
---
 drivers/char/mem.c |  105 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mm.h |    2 +
 mm/memory.c        |   90 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 195 insertions(+), 2 deletions(-)

--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -702,6 +702,100 @@ static ssize_t splice_write_null(struct
         return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
 }

+#if 1 //ndef CONFIG_XEN
+/*
+ * For fun, we are using the MMU for this.
+ */
+static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
+{
+        struct mm_struct *mm;
+        struct vm_area_struct * vma;
+        unsigned long addr=(unsigned long)buf;
+
+        mm = current->mm;
+        /* Oops, this was forgotten before. -ben */
+        down_read(&mm->mmap_sem);
+
+        /* For private mappings, just map in zero pages. */
+        for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
+                unsigned long count;
+
+                if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
+                        goto out_up;
+                if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
+                        break;
+                count = vma->vm_end - addr;
+                if (count > size)
+                        count = size;
+
+                zap_page_range(vma, addr, count, NULL);
+                if (zeromap_page_range(vma, addr, count, PAGE_COPY))
+                        break;
+
+                size -= count;
+                buf += count;
+                addr += count;
+                if (size == 0)
+                        goto out_up;
+        }
+
+        up_read(&mm->mmap_sem);
+
+        /* The shared case is hard. Let's do the conventional zeroing. */
+        do {
+                unsigned long unwritten = clear_user(buf, PAGE_SIZE);
+                if (unwritten)
+                        return size + unwritten - PAGE_SIZE;
+                cond_resched();
+                buf += PAGE_SIZE;
+                size -= PAGE_SIZE;
+        } while (size);
+
+        return size;
+out_up:
+        up_read(&mm->mmap_sem);
+        return size;
+}
+
+static ssize_t read_zero(struct file * file, char __user * buf,
+                         size_t count, loff_t *ppos)
+{
+        unsigned long left, unwritten, written = 0;
+
+        if (!count)
+                return 0;
+
+        if (!access_ok(VERIFY_WRITE, buf, count))
+                return -EFAULT;
+
+        left = count;
+
+        /* do we want to be clever? Arbitrary cut-off */
+        if (count >= PAGE_SIZE*4) {
+                unsigned long partial;
+
+                /* How much left of the page? */
+                partial = (PAGE_SIZE-1) & -(unsigned long) buf;
+                unwritten = clear_user(buf, partial);
+                written = partial - unwritten;
+                if (unwritten)
+                        goto out;
+                left -= partial;
+                buf += partial;
+                unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
+                written += (left & PAGE_MASK) - unwritten;
+                if (unwritten)
+                        goto out;
+                buf += left & PAGE_MASK;
+                left &= ~PAGE_MASK;
+        }
+        unwritten = clear_user(buf, left);
+        written += left - unwritten;
+out:
+        return written ? written : -EFAULT;
+}
+
+#else /* CONFIG_XEN */
 static ssize_t read_zero(struct file * file, char __user * buf,
                          size_t count, loff_t *ppos)
 {
@@ -730,15 +824,24 @@ static ssize_t read_zero(struct file * f
         }
         return written ? written : -EFAULT;
 }
+#endif /* CONFIG_XEN */

 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
 {
+        int err = 0;
+
 #ifndef CONFIG_MMU
         return -ENOSYS;
 #endif
+
         if (vma->vm_flags & VM_SHARED)
                 return shmem_zero_setup(vma);
-        return 0;
+#if 1 //ndef CONFIG_XEN
+        err = zeromap_page_range(vma, vma->vm_start,
+                        vma->vm_end - vma->vm_start, vma->vm_page_prot);
+        BUG_ON(err == -EEXIST);
+#endif
+        return err;
 }

 static ssize_t write_full(struct file * file, const char __user * buf,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,6 +779,8 @@ void free_pgd_range(struct mmu_gather *t
                 unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                         struct vm_area_struct *vma);
+int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
+                        unsigned long size, pgprot_t prot);
 void unmap_mapping_range(struct address_space *mapping,
                 loff_t const holebegin, loff_t const holelen, int even_cows);
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1268,6 +1268,95 @@ int get_user_pages(struct task_struct *t
 }
 EXPORT_SYMBOL(get_user_pages);

+static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
+                        unsigned long addr, unsigned long end, pgprot_t prot)
+{
+        pte_t *pte;
+        spinlock_t *ptl;
+        int err = 0;
+
+        pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+        if (!pte)
+                return -EAGAIN;
+        arch_enter_lazy_mmu_mode();
+        do {
+                struct page *page = ZERO_PAGE(addr);
+                pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+
+                if (unlikely(!pte_none(*pte))) {
+                        err = -EEXIST;
+                        pte++;
+                        break;
+                }
+                page_cache_get(page);
+                page_add_file_rmap(page);
+                inc_mm_counter(mm, file_rss);
+                set_pte_at(mm, addr, pte, zero_pte);
+        } while (pte++, addr += PAGE_SIZE, addr != end);
+        arch_leave_lazy_mmu_mode();
+        pte_unmap_unlock(pte - 1, ptl);
+        return err;
+}
+
+static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
+                        unsigned long addr, unsigned long end, pgprot_t prot)
+{
+        pmd_t *pmd;
+        unsigned long next;
+        int err;
+
+        pmd = pmd_alloc(mm, pud, addr);
+        if (!pmd)
+                return -EAGAIN;
+        do {
+                next = pmd_addr_end(addr, end);
+                err = zeromap_pte_range(mm, pmd, addr, next, prot);
+                if (err)
+                        break;
+        } while (pmd++, addr = next, addr != end);
+        return err;
+}
+
+static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
+                        unsigned long addr, unsigned long end, pgprot_t prot)
+{
+        pud_t *pud;
+        unsigned long next;
+        int err;
+
+        pud = pud_alloc(mm, pgd, addr);
+        if (!pud)
+                return -EAGAIN;
+        do {
+                next = pud_addr_end(addr, end);
+                err = zeromap_pmd_range(mm, pud, addr, next, prot);
+                if (err)
+                        break;
+        } while (pud++, addr = next, addr != end);
+        return err;
+}
+
+int zeromap_page_range(struct vm_area_struct *vma,
+                        unsigned long addr, unsigned long size, pgprot_t prot)
+{
+        pgd_t *pgd;
+        unsigned long next;
+        unsigned long end = addr + size;
+        struct mm_struct *mm = vma->vm_mm;
+        int err;
+
+        BUG_ON(addr >= end);
+        pgd = pgd_offset(mm, addr);
+        flush_cache_range(vma, addr, end);
+        do {
+                next = pgd_addr_end(addr, end);
+                err = zeromap_pud_range(mm, pgd, addr, next, prot);
+                if (err)
+                        break;
+        } while (pgd++, addr = next, addr != end);
+        return err;
+}
+
 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                          spinlock_t **ptl)
 {
@@ -1878,7 +1967,6 @@ gotten:

         if (unlikely(anon_vma_prepare(vma)))
                 goto oom;
-        VM_BUG_ON(old_page == ZERO_PAGE(0));
         new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
         if (!new_page)
                 goto oom;