+++ /dev/null
-From 57ea7c6c5051ca62883a775bd4c44264987ee3a9 Mon Sep 17 00:00:00 2001
-From: Sasha Levin <sashal@kernel.org>
-Date: Wed, 2 Sep 2020 17:59:04 +0200
-Subject: x86/mm/32: Bring back vmalloc faulting on x86_32
-
-From: Joerg Roedel <jroedel@suse.de>
-
-[ Upstream commit 4819e15f740ec884a50bdc431d7f1e7638b6f7d9 ]
-
-One cannot simply remove vmalloc faulting on x86-32. Upstream
-
- commit: 7f0a002b5a21 ("x86/mm: remove vmalloc faulting")
-
-removed it on x86 altogether because the
-arch_sync_kernel_mappings() interface had been introduced beforehand.
-This interface synchronizes vmalloc/ioremap page-table updates to all
-page-tables in the system at creation time and was thought to make
-vmalloc faulting obsolete.
-
-But that assumption was incredibly naive.
-
-It turned out that there is a race window between the time the vmalloc
-or ioremap code establishes a mapping and the time it synchronizes
-this change to other page-tables in the system.
-
-During this race window another CPU or thread can establish a vmalloc
-mapping that uses the same intermediate page-table entries (e.g. PMD
-or PUD) and then skips the synchronization step entirely, because it
-finds all necessary mappings already present in the kernel reference
-page-table.
-
-But when these intermediate page-table entries have not yet been
-synchronized, the other CPU or thread will continue with a vmalloc
-address that is not yet mapped in the page-table it currently uses,
-causing an unhandled page fault and an oops like the one below:
-
- BUG: unable to handle page fault for address: fe80c000
- #PF: supervisor write access in kernel mode
- #PF: error_code(0x0002) - not-present page
- *pde = 33183067 *pte = a8648163
- Oops: 0002 [#1] SMP
- CPU: 1 PID: 13514 Comm: cve-2017-17053 Tainted: G
- ...
- Call Trace:
- ldt_dup_context+0x66/0x80
- dup_mm+0x2b3/0x480
- copy_process+0x133b/0x15c0
- _do_fork+0x94/0x3e0
- __ia32_sys_clone+0x67/0x80
- __do_fast_syscall_32+0x3f/0x70
- do_fast_syscall_32+0x29/0x60
- do_SYSENTER_32+0x15/0x20
- entry_SYSENTER_32+0x9f/0xf2
- EIP: 0xb7eef549
-
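-Schematically, the race window looks like this (the CPU numbering is
-illustrative):
-
-    CPU 0                              CPU 1
-    vmalloc: allocate a new PMD
-             in the init_mm reference
-             page-table
-                                       vmalloc: find that PMD already
-                                       present in init_mm, skip the
-                                       synchronization step
-                                       touch the new mapping through a
-                                       page-table that still lacks the
-                                       PMD -> unhandled page fault
-    arch_sync_kernel_mappings()
-             (too late for CPU 1)
-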
-So the arch_sync_kernel_mappings() interface is racy, but removing it
-would mean re-introducing the vmalloc_sync_all() interface, which is
-even more awful. Keep arch_sync_kernel_mappings() in place and catch
-the race condition in the page-fault handler instead.
-
-Do a partial revert of the above commit to get vmalloc faulting on
-x86-32 back in place.
-
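-For reference, vmalloc_sync_one() (only its tail shows up as hunk
-context below) walks the faulting page-table and the init_mm
-reference page-table in lockstep and copies the missing PMD entry
-over. A rough sketch of the idea, not the verbatim implementation:
-
-    static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
-    {
-            /* Reference entry lives in init_mm, the kernel page-table. */
-            pgd_t *pgd_k = init_mm.pgd + pgd_index(address);
-            p4d_t *p4d, *p4d_k;
-            pud_t *pud, *pud_k;
-            pmd_t *pmd, *pmd_k;
-
-            pgd += pgd_index(address);
-            if (!pgd_present(*pgd_k))
-                    return NULL;
-
-            /* On x86-32 the p4d/pud levels fold into the pgd. */
-            p4d   = p4d_offset(pgd, address);
-            p4d_k = p4d_offset(pgd_k, address);
-            if (!p4d_present(*p4d_k))
-                    return NULL;
-
-            pud   = pud_offset(p4d, address);
-            pud_k = pud_offset(p4d_k, address);
-            if (!pud_present(*pud_k))
-                    return NULL;
-
-            pmd   = pmd_offset(pud, address);
-            pmd_k = pmd_offset(pud_k, address);
-
-            /* Copy the reference entry if this page-table lacks it. */
-            if (pmd_present(*pmd) != pmd_present(*pmd_k))
-                    set_pmd(pmd, *pmd_k);
-
-            return pmd_present(*pmd_k) ? pmd_k : NULL;
-    }
-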
-Fixes: 7f0a002b5a21 ("x86/mm: remove vmalloc faulting")
-Reported-by: Naresh Kamboju <naresh.kamboju@linaro.org>
-Signed-off-by: Joerg Roedel <jroedel@suse.de>
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Link: https://lore.kernel.org/r/20200902155904.17544-1-joro@8bytes.org
-Signed-off-by: Sasha Levin <sashal@kernel.org>
----
- arch/x86/mm/fault.c | 78 +++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 78 insertions(+)
-
-diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
-index 1ead568c01012..7cf290701a7c6 100644
---- a/arch/x86/mm/fault.c
-+++ b/arch/x86/mm/fault.c
-@@ -191,6 +191,53 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
- return pmd_k;
- }
-
-+/*
-+ * Handle a fault on the vmalloc or module mapping area
-+ *
-+ * This is needed because there is a race condition between the time
-+ * when the vmalloc mapping code updates the PMD and the time when it
-+ * synchronizes this update with the other page-tables in the
-+ * system.
-+ *
-+ * In this race window another thread/CPU can map an area on the same
-+ * PMD, find it already present, and not synchronize it with the rest
-+ * of the system. As a result, v[mz]alloc might return areas which are
-+ * not mapped in every page-table in the system, causing an unhandled
-+ * page-fault when they are accessed.
-+ */
-+static noinline int vmalloc_fault(unsigned long address)
-+{
-+ unsigned long pgd_paddr;
-+ pmd_t *pmd_k;
-+ pte_t *pte_k;
-+
-+ /* Make sure we are in vmalloc area: */
-+ if (!(address >= VMALLOC_START && address < VMALLOC_END))
-+ return -1;
-+
-+ /*
-+ * Synchronize this task's top level page-table
-+ * with the 'reference' page table.
-+ *
-+ * Do _not_ use "current" here. We might be inside
-+ * an interrupt in the middle of a task switch.
-+ */
-+ pgd_paddr = read_cr3_pa();
-+ pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
-+ if (!pmd_k)
-+ return -1;
-+
-+ if (pmd_large(*pmd_k))
-+ return 0;
-+
-+ pte_k = pte_offset_kernel(pmd_k, address);
-+ if (!pte_present(*pte_k))
-+ return -1;
-+
-+ return 0;
-+}
-+NOKPROBE_SYMBOL(vmalloc_fault);
-+
- void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
- {
- unsigned long addr;
-@@ -1111,6 +1158,37 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
- */
- WARN_ON_ONCE(hw_error_code & X86_PF_PK);
-
-+#ifdef CONFIG_X86_32
-+ /*
-+ * We can fault-in kernel-space virtual memory on-demand. The
-+ * 'reference' page table is init_mm.pgd.
-+ *
-+ * NOTE! We MUST NOT take any locks for this case. We may
-+ * be in an interrupt or a critical region, and should
-+ * only copy the information from the master page table,
-+ * nothing more.
-+ *
-+ * Before doing this on-demand faulting, ensure that the
-+ * fault is not any of the following:
-+ * 1. A fault on a PTE with a reserved bit set.
-+ * 2. A fault caused by a user-mode access. (Do not demand-
-+ * fault kernel memory due to user-mode accesses).
-+ * 3. A fault caused by a page-level protection violation.
-+ * (A demand fault would be on a non-present page which
-+ * would have X86_PF_PROT==0).
-+ *
-+ * This is only needed to close a race condition on x86-32 in
-+ * the vmalloc mapping/unmapping code. See the comment above
-+ * vmalloc_fault() for details. On x86-64 the race does not
-+ * exist as the vmalloc mappings don't need to be synchronized
-+ * there.
-+ */
-+ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-+ if (vmalloc_fault(address) >= 0)
-+ return;
-+ }
-+#endif
-+
- /* Was the fault spurious, caused by lazy TLB invalidation? */
- if (spurious_kernel_fault(hw_error_code, address))
- return;
---
-2.25.1
-