/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/swapops.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER 1
#define FRAG_MASK 0x0f
#else
#define ALLOC_ORDER 2
#define FRAG_MASK 0x03
#endif


unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	if (current->active_mm == mm)
		update_user_asce(mm, 1);
	__tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;
	int flush;

	BUG_ON(limit > (1UL << 53));
	flush = 0;
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
		flush = 1;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	if (flush)
		on_each_cpu(__crst_table_upgrade, mm, 0);
	return 0;
}

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (current->active_mm == mm) {
		clear_user_asce(mm, 1);
		__tlb_flush_mm(mm);
	}
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	if (current->active_mm == mm)
		update_user_asce(mm, 1);
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

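/*
 * Illustrative life cycle of a gmap (sketch only, the calls below are not
 * taken from this file): a host process allocates a guest address space
 * for its own mm, switches to it around guest execution and releases it
 * on teardown.
 *
 *	struct gmap *gmap;
 *
 *	gmap = gmap_alloc(current->mm);
 *	if (!gmap)
 *		return -ENOMEM;
 *	gmap_enable(gmap);
 *	...
 *	gmap_disable(gmap);
 *	gmap_free(gmap);
 */
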
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INVALID)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = mp->vmaddr | _SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_PROTECT;
	return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;


	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_asce(gmap->mm, (unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
	__releases(&gmap->mm->page_table_lock)
	__acquires(&gmap->mm->page_table_lock)
{
	struct page *page;
	unsigned long *new;

	/* since we dont free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INVALID) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INVALID;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > TASK_MAX_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INVALID) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = (from + off) | (_SEGMENT_ENTRY_INVALID |
					 _SEGMENT_ENTRY_PROTECT);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);

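/*
 * Example with hypothetical addresses: gmap_map_segment() and
 * gmap_unmap_segment() operate on whole segments, so 'from', 'to' and
 * 'len' must be PMD_SIZE (1 MB) aligned or -EINVAL is returned.
 *
 *	unsigned long from = 0x10000000UL;	(host address, example value)
 *	unsigned long to   = 0x00000000UL;	(guest address, example value)
 *	unsigned long len  = 0x00100000UL;	(one segment)
 *
 *	if (gmap_map_segment(gmap, from, to, len))
 *		goto fail;			(-EINVAL or -ENOMEM)
 *	...
 *	gmap_unmap_segment(gmap, to, len);
 */
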
static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INVALID))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}

/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INVALID)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_PROTECT) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

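/*
 * Illustrative use of the translation helpers (sketch, not taken from this
 * file): gmap_translate() takes mmap_sem itself, __gmap_translate() expects
 * the caller to hold it already. Both return the parent address or -EFAULT.
 *
 *	unsigned long uaddr;
 *
 *	uaddr = gmap_translate(gaddr, gmap);
 *	if (IS_ERR_VALUE(uaddr))
 *		return -EFAULT;
 */
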
static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}

static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry = mp->vmaddr | (_SEGMENT_ENTRY_INVALID |
					     _SEGMENT_ENTRY_PROTECT);
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INVALID)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_PROTECT))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);

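/*
 * Sketch of a hypothetical resolve-and-fault sequence: __gmap_fault()
 * connects the guest segment to the parent page table if necessary and
 * returns the parent address, which can then be resolved with the normal
 * fault machinery (gmap_ipte_notify() below uses this very pattern).
 *
 *	addr = __gmap_fault(gaddr, gmap);	(mmap_sem held by the caller)
 *	if (IS_ERR_VALUE(addr))
 *		return addr;
 *	if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE))
 *		return -EFAULT;
 */
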
static void gmap_zap_swap_entry(swp_entry_t entry, struct mm_struct *mm)
{
	if (!non_swap_entry(entry))
		dec_mm_counter(mm, MM_SWAPENTS);
	else if (is_migration_entry(entry)) {
		struct page *page = migration_entry_to_page(entry);

		if (PageAnon(page))
			dec_mm_counter(mm, MM_ANONPAGES);
		else
			dec_mm_counter(mm, MM_FILEPAGES);
	}
	free_swap_and_cache(entry);
}

/**
 * The mm->mmap_sem lock must be held
 */
static void gmap_zap_unused(struct mm_struct *mm, unsigned long address)
{
	unsigned long ptev, pgstev;
	spinlock_t *ptl;
	pgste_t pgste;
	pte_t *ptep, pte;

	ptep = get_locked_pte(mm, address, &ptl);
	if (unlikely(!ptep))
		return;
	pte = *ptep;
	if (!pte_swap(pte))
		goto out_pte;
	/* Zap unused and logically-zero pages */
	pgste = pgste_get_lock(ptep);
	pgstev = pgste_val(pgste);
	ptev = pte_val(pte);
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID))) {
		gmap_zap_swap_entry(pte_to_swp_entry(pte), mm);
		pte_clear(mm, address, ptep);
	}
	pgste_set_unlock(ptep, pgste);
out_pte:
	pte_unmap_unlock(*ptep, ptl);
}

/*
 * this function is assumed to be called with mmap_sem held
 */
void __gmap_zap(unsigned long address, struct gmap *gmap)
{
	unsigned long *table, *segment_ptr;
	unsigned long segment, pgstev, ptev;
	struct gmap_pgtable *mp;
	struct page *page;

	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return;
	segment = *segment_ptr;
	if (segment & _SEGMENT_ENTRY_INVALID)
		return;
	page = pfn_to_page(segment >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	address = mp->vmaddr | (address & ~PMD_MASK);
	/* Page table is present */
	table = (unsigned long *)(segment & _SEGMENT_ENTRY_ORIGIN);
	table = table + ((address >> 12) & 0xff);
	pgstev = table[PTRS_PER_PTE];
	ptev = table[0];
	/* quick check, checked again with locks held */
	if (((pgstev & _PGSTE_GPS_USAGE_MASK) == _PGSTE_GPS_USAGE_UNUSED) ||
	    ((pgstev & _PGSTE_GPS_ZERO) && (ptev & _PAGE_INVALID)))
		gmap_zap_unused(gmap->mm, address);
}
EXPORT_SYMBOL_GPL(__gmap_zap);

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{

	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct page *page;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INVALID)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);

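/*
 * Illustrative notifier usage (callback name is made up): the callback
 * registered here is invoked from gmap_do_ipte_notify() below with the
 * gmap and the guest address of the invalidated pte.
 *
 *	static void my_pte_notifier(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		...
 *	}
 *
 *	static struct gmap_notifier nb = { .notifier_call = my_pte_notifier };
 *
 *	gmap_register_ipte_notifier(&nb);
 *	rc = gmap_ipte_notify(gmap, start, len);	(page aligned range)
 */
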
/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}

static inline int page_table_with_pgste(struct page *page)
{
	return atomic_read(&page->_mapcount) == 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	if (!pgtable_page_ctor(page)) {
		kfree(mp);
		__free_page(page);
		return NULL;
	}
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 0);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, PGSTE_HR_BIT | PGSTE_HC_BIT,
		    PAGE_SIZE/2);
	return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}

static inline unsigned long page_table_reset_pte(struct mm_struct *mm,
			pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *start_pte, *pte;
	spinlock_t *ptl;
	pgste_t pgste;

	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	pte = start_pte;
	do {
		pgste = pgste_get_lock(pte);
		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
		pgste_set_unlock(pte, pgste);
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(start_pte, ptl);

	return addr;
}

static inline unsigned long page_table_reset_pmd(struct mm_struct *mm,
			pud_t *pud, unsigned long addr, unsigned long end)
{
	unsigned long next;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		next = page_table_reset_pte(mm, pmd, addr, next);
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static inline unsigned long page_table_reset_pud(struct mm_struct *mm,
			pgd_t *pgd, unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_reset_pmd(mm, pud, addr, next);
	} while (pud++, addr = next, addr != end);

	return addr;
}

void page_table_reset_pgste(struct mm_struct *mm,
			    unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pgd_t *pgd;

	addr = start;
	down_read(&mm->mmap_sem);
	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_reset_pud(mm, pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
	up_read(&mm->mmap_sem);
}
EXPORT_SYMBOL(page_table_reset_pgste);

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits, skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = (unsigned long) page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		skey = key & (_PAGE_ACC_BITS | _PAGE_FP_BIT);
		/* Set storage key ACC and FP */
		page_set_storage_key(address, skey, !nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_HC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(*ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);

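/*
 * Illustrative call (values are made up): the key argument carries the
 * storage key in the _PAGE_ACC_BITS/_PAGE_FP_BIT/_PAGE_CHANGED/
 * _PAGE_REFERENCED format used above, nq selects the non-quiescing
 * variant of the key update.
 *
 *	rc = set_guest_storage_key(current->mm, addr, key, false);
 *	if (rc)
 *		return rc;	(-EFAULT if no pte could be located)
 */
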
#else /* CONFIG_PGSTE */

static inline int page_table_with_pgste(struct page *page)
{
	return 0;
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		if (!pgtable_page_ctor(page)) {
			__free_page(page);
			return NULL;
		}
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_INVALID, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

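/*
 * Fragment accounting of the allocator above, shown on an example: without
 * pgstes a 4K page is split into 1K/2K page table fragments and
 * page->_mapcount carries one bit per fragment in its low nibble, while
 * page_table_free_rcu() parks the bit in the high nibble until the RCU
 * grace period has passed so the fragment is not handed out again early.
 *
 *	With FRAG_MASK == 0x03 (64 bit, two 2K fragments per page):
 *		_mapcount == 0x01	first fragment in use
 *		_mapcount == 0x03	page full, removed from pgtable_list
 *		_mapcount == 0x00	page unused, freed by page_table_free()
 */
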
static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (page_table_with_pgste(page)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}

static void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}

static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	tlb->mm->context.flush_mm = 1;
	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm_lazy(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_flush_mmu(tlb);
}

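/*
 * Encoding of the table pointers queued via tlb_remove_table(), derived
 * from the code above: page tables are at least 1K aligned, so the low
 * bits of the address are free to tag the entry for __tlb_remove_table().
 *
 *	low bits == 0			crst table, freed with free_pages()
 *	low bits == FRAG_MASK		pgste page table, page_table_free_pgste()
 *	low bits == fragment bit << 4	1K/2K fragment, __page_table_free_rcu()
 */
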
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
		follow_page(vma, addr, FOLL_SPLIT);
}

static inline void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
	}
	mm->def_flags |= VM_NOHUGEPAGE;
}
#else
static inline void thp_split_mm(struct mm_struct *mm)
{
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
				struct mm_struct *mm, pud_t *pud,
				unsigned long addr, unsigned long end)
{
	unsigned long next, *table, *new;
	struct page *page;
	pmd_t *pmd;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
again:
		if (pmd_none_or_clear_bad(pmd))
			continue;
		table = (unsigned long *) pmd_deref(*pmd);
		page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
		if (page_table_with_pgste(page))
			continue;
		/* Allocate new page table with pgstes */
		new = page_table_alloc_pgste(mm, addr);
		if (!new)
			return -ENOMEM;

		spin_lock(&mm->page_table_lock);
		if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
			/* Nuke pmd entry pointing to the "short" page table */
			pmdp_flush_lazy(mm, addr, pmd);
			pmd_clear(pmd);
			/* Copy ptes from old table to new table */
			memcpy(new, table, PAGE_SIZE/2);
			clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
			/* Establish new table */
			pmd_populate(mm, pmd, (pte_t *) new);
			/* Free old table with rcu, there might be a walker! */
			page_table_free_rcu(tlb, table);
			new = NULL;
		}
		spin_unlock(&mm->page_table_lock);
		if (new) {
			page_table_free_pgste(new);
			goto again;
		}
	} while (pmd++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
				   struct mm_struct *mm, pgd_t *pgd,
				   unsigned long addr, unsigned long end)
{
	unsigned long next;
	pud_t *pud;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pud++, addr = next, addr != end);

	return addr;
}

static unsigned long page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
					unsigned long addr, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;

	pgd = pgd_offset(mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
		if (unlikely(IS_ERR_VALUE(next)))
			return next;
	} while (pgd++, addr = next, addr != end);

	return 0;
}

/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm = tsk->mm;
	struct mmu_gather tlb;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	down_write(&mm->mmap_sem);
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	/* Reallocate the page tables with pgstes */
	tlb_gather_mmu(&tlb, mm, 0, TASK_SIZE);
	if (!page_table_realloc(&tlb, mm, 0, TASK_SIZE))
		mm->context.has_pgste = 1;
	tlb_finish_mmu(&tlb, 0, TASK_SIZE);
	up_write(&mm->mmap_sem);
	return mm->context.has_pgste ? 0 : -ENOMEM;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

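/*
 * Illustrative caller (sketch, not from this file): a hypervisor ioctl
 * path would switch its mm to pgste-enabled page tables once, before any
 * guest mapping is created, since page_table_realloc() above rewrites all
 * existing page tables of the process.
 *
 *	rc = s390_enable_sie();
 *	if (rc)
 *		return rc;	(-ENOMEM if the page tables could not be
 *				 reallocated)
 *	gmap = gmap_alloc(current->mm);
 *	...
 */
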
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_INVALID;
	ptep++;
	pte_val(*ptep) = _PAGE_INVALID;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */