1 From: Nick Piggin <npiggin@suse.de>
2 Subject: mm: /dev/zero optimisation
4 Patch-mainline: no (could be submitted)
6 Patch for removal of ZERO_PAGE from main VM paths also removed the
7 /dev/zero optimisation to map directly from ZERO_PAGE when doing
8 mmap() and also the interesting read(2) "hack" where the MMU was
9 used to make zero-filling the target buffer zero-copy.
11 Some benchmarks have run into issues with this. Customers sometimes
12 use these benchmarks to qualify and test systems, so even if the
13 benchmarks themselves are "stupid", it saves some trouble to retain
14 this optimisation for them. Also, while I don't think it was established
15 that there is a "real" workload where this helps, it can't be proven
16 that one does not exist.
18 Signed-off-by: Nick Piggin <npiggin@suse.de>
20 drivers/char/mem.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
21 include/linux/mm.h | 2 +
22 mm/memory.c | 90 ++++++++++++++++++++++++++++++++++++++++++++-
23 3 files changed, 195 insertions(+), 2 deletions(-)
25 --- a/drivers/char/mem.c
26 +++ b/drivers/char/mem.c
27 @@ -702,6 +702,100 @@ static ssize_t splice_write_null(struct
28 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_null);
31 +#if 1 //ndef CONFIG_XEN
33 + * For fun, we are using the MMU for this.
35 +static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
37 + struct mm_struct *mm;
38 + struct vm_area_struct * vma;
39 + unsigned long addr=(unsigned long)buf;
42 + /* Oops, this was forgotten before. -ben */
43 + down_read(&mm->mmap_sem);
45 + /* For private mappings, just map in zero pages. */
46 + for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) {
47 + unsigned long count;
49 + if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
51 + if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
53 + count = vma->vm_end - addr;
57 + zap_page_range(vma, addr, count, NULL);
58 + if (zeromap_page_range(vma, addr, count, PAGE_COPY))
68 + up_read(&mm->mmap_sem);
70 + /* The shared case is hard. Let's do the conventional zeroing. */
72 + unsigned long unwritten = clear_user(buf, PAGE_SIZE);
74 + return size + unwritten - PAGE_SIZE;
82 + up_read(&mm->mmap_sem);
86 +static ssize_t read_zero(struct file * file, char __user * buf,
87 + size_t count, loff_t *ppos)
89 + unsigned long left, unwritten, written = 0;
94 + if (!access_ok(VERIFY_WRITE, buf, count))
99 + /* do we want to be clever? Arbitrary cut-off */
100 + if (count >= PAGE_SIZE*4) {
101 + unsigned long partial;
103 + /* How much left of the page? */
104 + partial = (PAGE_SIZE-1) & -(unsigned long) buf;
105 + unwritten = clear_user(buf, partial);
106 + written = partial - unwritten;
111 + unwritten = read_zero_pagealigned(buf, left & PAGE_MASK);
112 + written += (left & PAGE_MASK) - unwritten;
115 + buf += left & PAGE_MASK;
116 + left &= ~PAGE_MASK;
118 + unwritten = clear_user(buf, left);
119 + written += left - unwritten;
121 + return written ? written : -EFAULT;
124 +#else /* CONFIG_XEN */
125 static ssize_t read_zero(struct file * file, char __user * buf,
126 size_t count, loff_t *ppos)
128 @@ -730,15 +824,24 @@ static ssize_t read_zero(struct file * f
130 return written ? written : -EFAULT;
132 +#endif /* CONFIG_XEN */
134 static int mmap_zero(struct file * file, struct vm_area_struct * vma)
142 if (vma->vm_flags & VM_SHARED)
143 return shmem_zero_setup(vma);
145 +#if 1 //ndef CONFIG_XEN
146 + err = zeromap_page_range(vma, vma->vm_start,
147 + vma->vm_end - vma->vm_start, vma->vm_page_prot);
148 + BUG_ON(err == -EEXIST);
153 static ssize_t write_full(struct file * file, const char __user * buf,
154 --- a/include/linux/mm.h
155 +++ b/include/linux/mm.h
156 @@ -791,6 +791,8 @@ void free_pgd_range(struct mmu_gather *t
157 unsigned long end, unsigned long floor, unsigned long ceiling);
158 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
159 struct vm_area_struct *vma);
160 +int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
161 + unsigned long size, pgprot_t prot);
162 void unmap_mapping_range(struct address_space *mapping,
163 loff_t const holebegin, loff_t const holelen, int even_cows);
164 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
167 @@ -1268,6 +1268,95 @@ int get_user_pages(struct task_struct *t
169 EXPORT_SYMBOL(get_user_pages);
171 +static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
172 + unsigned long addr, unsigned long end, pgprot_t prot)
178 + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
181 + arch_enter_lazy_mmu_mode();
183 + struct page *page = ZERO_PAGE(addr);
184 + pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
186 + if (unlikely(!pte_none(*pte))) {
191 + page_cache_get(page);
192 + page_add_file_rmap(page);
193 + inc_mm_counter(mm, file_rss);
194 + set_pte_at(mm, addr, pte, zero_pte);
195 + } while (pte++, addr += PAGE_SIZE, addr != end);
196 + arch_leave_lazy_mmu_mode();
197 + pte_unmap_unlock(pte - 1, ptl);
201 +static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
202 + unsigned long addr, unsigned long end, pgprot_t prot)
205 + unsigned long next;
208 + pmd = pmd_alloc(mm, pud, addr);
212 + next = pmd_addr_end(addr, end);
213 + err = zeromap_pte_range(mm, pmd, addr, next, prot);
216 + } while (pmd++, addr = next, addr != end);
220 +static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
221 + unsigned long addr, unsigned long end, pgprot_t prot)
224 + unsigned long next;
227 + pud = pud_alloc(mm, pgd, addr);
231 + next = pud_addr_end(addr, end);
232 + err = zeromap_pmd_range(mm, pud, addr, next, prot);
235 + } while (pud++, addr = next, addr != end);
239 +int zeromap_page_range(struct vm_area_struct *vma,
240 + unsigned long addr, unsigned long size, pgprot_t prot)
243 + unsigned long next;
244 + unsigned long end = addr + size;
245 + struct mm_struct *mm = vma->vm_mm;
248 + BUG_ON(addr >= end);
249 + pgd = pgd_offset(mm, addr);
250 + flush_cache_range(vma, addr, end);
252 + next = pgd_addr_end(addr, end);
253 + err = zeromap_pud_range(mm, pgd, addr, next, prot);
256 + } while (pgd++, addr = next, addr != end);
260 pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
263 @@ -1899,7 +1988,6 @@ gotten:
265 if (unlikely(anon_vma_prepare(vma)))
267 - VM_BUG_ON(old_page == ZERO_PAGE(0));
268 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);