/*
 * linux/arch/x86_64/mm/init.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */

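/*
 * Dump a summary of memory usage to the console: walk every node's
 * pages and count how many are reserved, swap cached or shared, then
 * print the totals.
 */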
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%ld pages of RAM\n", total);
        printk(KERN_INFO "%ld reserved pages\n", reserved);
        printk(KERN_INFO "%ld pages shared\n", shared);
        printk(KERN_INFO "%ld pages swap cached\n", cached);
}

/* References to section boundaries */

int after_bootmem;

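/*
 * Allocate a zeroed page for use as a kernel page table.  Before
 * mem_init() runs this comes from the bootmem allocator, afterwards
 * from the normal page allocator.
 */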
static void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}

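/*
 * Install a single kernel mapping from vaddr to the physical address
 * phys with protection prot, allocating intermediate page tables with
 * spp_getpage() as needed.  Used by __set_fixmap() below.
 */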
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

/* NOTE: this is meant to be run only at boot */
void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}

unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void *address;
        int allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};

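/*
 * Hand out pages for the early page tables.  The direct mapping does
 * not exist yet, so each page taken from the range reserved by
 * find_early_table_space() is made accessible through one of the
 * temporary 2MB mappings above; unmap_low_page() releases the slot.
 */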
static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        }
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys = pfn * PAGE_SIZE;
        return adr;
}

static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}

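/*
 * Create 2MB kernel mappings for the physical range [address, end)
 * covered by one pud page.  Ranges that are not present in the e820
 * map are left unmapped.
 */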
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i, j;

        i = pud_index(address);
        pud = pud + i;
        for (; i < PTRS_PER_PUD; pud++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = address + i*PUD_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PUD; i++, pud++)
                                set_pud(pud, __pud(0));
                        break;
                }

                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                pmd = alloc_low_page(&map, &pmd_phys);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}

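/*
 * Estimate the worst-case number of pud and pmd pages needed to map
 * physical memory up to 'end' and reserve a contiguous area for them
 * in the e820 map, below the kernel text.
 */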
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;
}

/* Set up the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                int map;
                unsigned long pud_phys;
                pud_t *pud = alloc_low_page(&map, &pud_phys);
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(map);
        }

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n", end,
                     table_start<<PAGE_SHIFT,
                     table_end<<PAGE_SHIFT);
}

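/*
 * Remove the low identity mappings left over from early boot once a
 * CPU runs entirely on the high kernel mapping.  The boot CPU clears
 * its pgd entry directly; APs simply switch to init_level4_pgt.
 */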
void __cpuinit zap_low_mappings(int cpu)
{
        if (cpu == 0) {
                pgd_t *pgd = pgd_offset_k(0UL);
                pgd_clear(pgd);
        } else {
                /*
                 * For AP's, zap the low identity mappings by changing the cr3
                 * to init_level4_pgt and doing local flush tlb all
                 */
                asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
        }
        __flush_tlb_all();
}

/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = start_pfn;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                               "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}

#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif

/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}

static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;

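/*
 * Final low-level memory setup: hand all bootmem pages to the buddy
 * allocator, register the regions exported through /proc/kcore and
 * print the memory banner.  With SMP the boot page table is also
 * synced with init_level4_pgt for AP bringup.
 */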
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture &&
            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
                swiotlb = 1;
        if (swiotlb)
                swiotlb_init();
#endif

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}

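/*
 * Free the memory occupied by the __init sections once boot is done.
 * The pages are poisoned with 0xcc (and __initdata with 0xba) so that
 * any stray late reference is likely to trap rather than silently work.
 */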
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
}

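/*
 * Free the pages holding the initrd image once it is no longer needed,
 * unless the range starts below the end of the kernel image.
 */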
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif

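/*
 * Reserve a physical range with the bootmem allocator (per node when
 * NUMA is enabled).  Reservations that lie entirely below the 16MB DMA
 * limit are also accounted in dma_reserve for size_zones() above.
 */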
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}

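/*
 * Check whether a kernel virtual address is canonical and backed by a
 * present page-table entry, taking 2MB large pages into account.
 */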
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}

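/*
 * Register the x86-64 specific /proc/sys/debug entries
 * (currently only "exception-trace").
 */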
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif

/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}