+++ /dev/null
-From 67f87463d3a3362424efcbe8b40e4772fd34fc61 Mon Sep 17 00:00:00 2001
-From: Mel Gorman <mgorman@suse.de>
-Date: Wed, 18 Dec 2013 17:08:34 -0800
-Subject: mm: clear pmd_numa before invalidating
-
-From: Mel Gorman <mgorman@suse.de>
-
-commit 67f87463d3a3362424efcbe8b40e4772fd34fc61 upstream.
-
-On x86, PMD entries are similar to _PAGE_PROTNONE protection and are
-handled as NUMA hinting faults. The following two page table protection
-bits are what define them:
-
- _PAGE_NUMA:set _PAGE_PRESENT:clear
-
-A PMD is considered present if any of the _PAGE_PRESENT, _PAGE_PROTNONE,
-_PAGE_PSE or _PAGE_NUMA bits are set. If pmdp_invalidate encounters a
-pmd_numa, it clears the present bit, leaving _PAGE_NUMA, which will be
-considered not present by the CPU but present by pmd_present. The
-existing caller of pmdp_invalidate should handle it, but it is an
-inconsistent state for a PMD. This patch keeps the state consistent
-when calling pmdp_invalidate.
-
-Signed-off-by: Mel Gorman <mgorman@suse.de>
-Reviewed-by: Rik van Riel <riel@redhat.com>
-Cc: Alex Thorlton <athorlton@sgi.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- mm/pgtable-generic.c | 5 ++++-
- 1 file changed, 4 insertions(+), 1 deletion(-)
-
---- a/mm/pgtable-generic.c
-+++ b/mm/pgtable-generic.c
-@@ -191,6 +191,9 @@ pgtable_t pgtable_trans_huge_withdraw(st
- void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
- {
-- set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
-+ pmd_t entry = *pmdp;
-+ if (pmd_numa(entry))
-+ entry = pmd_mknonnuma(entry);
-+ set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
- flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
- }
--- /dev/null
+From mgorman@suse.de Tue Jan 7 09:45:19 2014
+From: Mel Gorman <mgorman@suse.de>
+Date: Tue, 7 Jan 2014 14:00:37 +0000
+Subject: mm: numa: call MMU notifiers on THP migration
+To: gregkh@linuxfoundation.org
+Cc: athorlton@sgi.com, riel@redhat.com, chegu_vinod@hp.com, Mel Gorman <mgorman@suse.de>, stable@vger.kernel.org
+Message-ID: <1389103248-17617-3-git-send-email-mgorman@suse.de>
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit f714f4f20e59ea6eea264a86b9a51fd51b88fc54 upstream.
+
+MMU notifiers must be called on THP page migration or secondary MMUs
+will be left with stale translations for the migrated range.
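+
+For reference, the bracketing added here has the following shape (a
+minimal sketch only: the PMD teardown, the fail_putback unwinding and
+all error handling are elided):
+
+    unsigned long mmun_start = address & HPAGE_PMD_MASK;
+    unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+
+    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+    spin_lock(&mm->page_table_lock);
+    /* ... clear the old PMD and establish the new one ... */
+    spin_unlock(&mm->page_table_lock);
+    mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+A secondary MMU (KVM's shadow page tables, for instance) drops its
+translations for the range at the start callback and may not establish
+new ones until the end callback fires, so it cannot retain a stale
+mapping of the old huge page across the migration.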
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Alex Thorlton <athorlton@sgi.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/migrate.c | 22 ++++++++++++++--------
+ 1 file changed, 14 insertions(+), 8 deletions(-)
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -36,6 +36,7 @@
+ #include <linux/hugetlb_cgroup.h>
+ #include <linux/gfp.h>
+ #include <linux/balloon_compaction.h>
++#include <linux/mmu_notifier.h>
+
+ #include <asm/tlbflush.h>
+
+@@ -1655,12 +1656,13 @@ int migrate_misplaced_transhuge_page(str
+ unsigned long address,
+ struct page *page, int node)
+ {
+- unsigned long haddr = address & HPAGE_PMD_MASK;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
++ unsigned long mmun_start = address & HPAGE_PMD_MASK;
++ unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ pmd_t orig_entry;
+
+ /*
+@@ -1702,10 +1704,12 @@ int migrate_misplaced_transhuge_page(str
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+ fail_putback:
+ spin_unlock(&mm->page_table_lock);
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+@@ -1746,15 +1750,16 @@ fail_putback:
+ * The SetPageUptodate on the new page and page_add_new_anon_rmap
+ * guarantee the copy is visible before the pagetable update.
+ */
+- flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+- page_add_new_anon_rmap(new_page, vma, haddr);
+- pmdp_clear_flush(vma, haddr, pmd);
+- set_pmd_at(mm, haddr, pmd, entry);
++ flush_cache_range(vma, mmun_start, mmun_end);
++ page_add_new_anon_rmap(new_page, vma, mmun_start);
++ pmdp_clear_flush(vma, mmun_start, pmd);
++ set_pmd_at(mm, mmun_start, pmd, entry);
++ flush_tlb_range(vma, mmun_start, mmun_end);
+ update_mmu_cache_pmd(vma, address, &entry);
+
+ if (page_count(page) != 2) {
+- set_pmd_at(mm, haddr, pmd, orig_entry);
+- flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
++ set_pmd_at(mm, mmun_start, pmd, orig_entry);
++ flush_tlb_range(vma, mmun_start, mmun_end);
+ update_mmu_cache_pmd(vma, address, &entry);
+ page_remove_rmap(new_page);
+ goto fail_putback;
+@@ -1769,6 +1774,7 @@ fail_putback:
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(&mm->page_table_lock);
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
+ unlock_page(new_page);
+ unlock_page(page);
+@@ -1789,7 +1795,7 @@ out_dropref:
+ spin_lock(&mm->page_table_lock);
+ if (pmd_same(*pmd, entry)) {
+ entry = pmd_mknonnuma(entry);
+- set_pmd_at(mm, haddr, pmd, entry);
++ set_pmd_at(mm, mmun_start, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ }
+ spin_unlock(&mm->page_table_lock);
--- /dev/null
+From mgorman@suse.de Tue Jan 7 09:44:16 2014
+From: Mel Gorman <mgorman@suse.de>
+Date: Tue, 7 Jan 2014 14:00:36 +0000
+Subject: mm: numa: serialise parallel get_user_page against THP migration
+To: gregkh@linuxfoundation.org
+Cc: athorlton@sgi.com, riel@redhat.com, chegu_vinod@hp.com, Mel Gorman <mgorman@suse.de>, stable@vger.kernel.org
+Message-ID: <1389103248-17617-2-git-send-email-mgorman@suse.de>
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 2b4847e73004c10ae6666c2e27b5c5430aed8698 upstream.
+
+During normal page migration, base pages are unmapped and flushed from
+the cache and TLB, then replaced with a migration entry that causes any
+parallel NUMA hinting fault or gup to block until migration completes.
+
+THP does not unmap pages because migration entries are not supported at
+the PMD level. This allows races with get_user_pages and
+get_user_pages_fast, which commit 3f926ab945b6 ("mm: Close races between
+THP migration and PMD numa clearing") made worse by introducing a
+pmdp_clear_flush().
+
+This patch forces get_user_pages (fast and normal) on a pmd_numa page to
+go through the slow get_user_pages path, where it will serialise against
+THP migration and properly account for the NUMA hinting fault. On the
+migration side the page table lock is taken for each PTE update.
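+
+For context, callers of the GUP fast path already fall back like this
+when it bails out (a sketch of the fallback pattern in kernels of this
+era, not a verbatim copy of any one caller):
+
+    /* Lockless walk: returns how many pages it managed to pin. */
+    nr = __get_user_pages_fast(start, nr_pages, write, pages);
+    if (nr < nr_pages) {
+        /*
+         * Slow path for the remainder: takes mmap_sem, raises NUMA
+         * hinting faults for accounting, and blocks on the page lock
+         * that THP migration holds for the duration of the move.
+         */
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, start + (nr << PAGE_SHIFT),
+                             nr_pages - nr, write, 0, pages + nr, NULL);
+        up_read(&mm->mmap_sem);
+    }
+
+Returning 0 for pmd_numa entries in the fast-path walkers below routes
+the access through that fault path instead of silently pinning a page
+that may be in the middle of a migration.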
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Alex Thorlton <athorlton@sgi.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/gup.c | 13 +++++++++++++
+ mm/huge_memory.c | 24 ++++++++++++++++--------
+ mm/migrate.c | 38 +++++++++++++++++++++++++++++++-------
+ 3 files changed, 60 insertions(+), 15 deletions(-)
+
+--- a/arch/x86/mm/gup.c
++++ b/arch/x86/mm/gup.c
+@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t
+ pte_t pte = gup_get_pte(ptep);
+ struct page *page;
+
++ /* Similar to the PMD case, NUMA hinting must take slow path */
++ if (pte_numa(pte)) {
++ pte_unmap(ptep);
++ return 0;
++ }
++
+ if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ pte_unmap(ptep);
+ return 0;
+@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsi
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return 0;
+ if (unlikely(pmd_large(pmd))) {
++ /*
++ * NUMA hinting faults need to be handled in the GUP
++ * slowpath for accounting purposes and so that they
++ * can be serialised against THP migration.
++ */
++ if (pmd_numa(pmd))
++ return 0;
+ if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
+ return 0;
+ } else {
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1240,6 +1240,10 @@ struct page *follow_trans_huge_pmd(struc
+ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
+ return ERR_PTR(-EFAULT);
+
++ /* Take full NUMA hinting faults to serialise migration in fault paths */
++ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
++ goto out;
++
+ page = pmd_page(*pmd);
+ VM_BUG_ON(!PageHead(page));
+ if (flags & FOLL_TOUCH) {
+@@ -1306,23 +1310,27 @@ int do_huge_pmd_numa_page(struct mm_stru
+ /* If the page was locked, there are no parallel migrations */
+ if (page_locked)
+ goto clear_pmdnuma;
++ }
+
+- /*
+- * Otherwise wait for potential migrations and retry. We do
+- * relock and check_same as the page may no longer be mapped.
+- * As the fault is being retried, do not account for it.
+- */
++ /*
++ * If there are potential migrations, wait for completion and retry. We
++ * do not relock and check_same as the page may no longer be mapped.
++ * Furthermore, even if the page is currently misplaced, there is no
++ * guarantee it is still misplaced after the migration completes.
++ */
++ if (!page_locked) {
+ spin_unlock(&mm->page_table_lock);
+ wait_on_page_locked(page);
+ page_nid = -1;
+ goto out;
+ }
+
+- /* Page is misplaced, serialise migrations and parallel THP splits */
++ /*
++ * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
++ * to serialise splits
++ */
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+- if (!page_locked)
+- lock_page(page);
+ anon_vma = page_lock_anon_vma_read(page);
+
+ /* Confirm the PTE did not change while locked */
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1661,6 +1661,7 @@ int migrate_misplaced_transhuge_page(str
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
++ pmd_t orig_entry;
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+@@ -1702,7 +1703,8 @@ int migrate_misplaced_transhuge_page(str
+
+ /* Recheck the target PMD */
+ spin_lock(&mm->page_table_lock);
+- if (unlikely(!pmd_same(*pmd, entry))) {
++ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
++fail_putback:
+ spin_unlock(&mm->page_table_lock);
+
+ /* Reverse changes made by migrate_page_copy() */
+@@ -1732,16 +1734,34 @@ int migrate_misplaced_transhuge_page(str
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
++ orig_entry = *pmd;
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+- entry = pmd_mknonnuma(entry);
+- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
++ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+
++ /*
++ * Clear the old entry under pagetable lock and establish the new PTE.
++ * Any parallel GUP will either observe the old page and block on its
++ * page lock, block on the page table lock, or observe the new page.
++ * The SetPageUptodate on the new page and page_add_new_anon_rmap
++ * guarantee the copy is visible before the pagetable update.
++ */
++ flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
++ page_add_new_anon_rmap(new_page, vma, haddr);
+ pmdp_clear_flush(vma, haddr, pmd);
+ set_pmd_at(mm, haddr, pmd, entry);
+- page_add_new_anon_rmap(new_page, vma, haddr);
+ update_mmu_cache_pmd(vma, address, &entry);
++
++ if (page_count(page) != 2) {
++ set_pmd_at(mm, haddr, pmd, orig_entry);
++ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
++ update_mmu_cache_pmd(vma, address, &entry);
++ page_remove_rmap(new_page);
++ goto fail_putback;
++ }
++
+ page_remove_rmap(page);
++
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+@@ -1766,9 +1786,13 @@ int migrate_misplaced_transhuge_page(str
+ out_fail:
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ out_dropref:
+- entry = pmd_mknonnuma(entry);
+- set_pmd_at(mm, haddr, pmd, entry);
+- update_mmu_cache_pmd(vma, address, &entry);
++ spin_lock(&mm->page_table_lock);
++ if (pmd_same(*pmd, entry)) {
++ entry = pmd_mknonnuma(entry);
++ set_pmd_at(mm, haddr, pmd, entry);
++ update_mmu_cache_pmd(vma, address, &entry);
++ }
++ spin_unlock(&mm->page_table_lock);
+
+ unlock_page(page);
+ put_page(page);
acpi-pci-hotplug-avoid-warning-when-_adr-not-present.patch
intel_pstate-fail-initialization-if-p-state-information-is-missing.patch
revert-of-address-handle-address-cells-2-specially.patch
-mm-clear-pmd_numa-before-invalidating.patch
+mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch
+mm-numa-call-mmu-notifiers-on-thp-migration.patch
mm-numa-ensure-anon_vma-is-locked-to-prevent-parallel-thp-splits.patch
mm-numa-avoid-unnecessary-work-on-the-failure-path.patch
mm-fix-tlb-flush-race-between-migration-and-change_protection_range.patch