From f78fb89ea532a96f17585a80132f6f5c9ed12afa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 7 Jan 2014 09:47:30 -0800 Subject: [PATCH] 3.12-stable patches added patches: mm-numa-call-mmu-notifiers-on-thp-migration.patch mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch --- ...m-clear-pmd_numa-before-invalidating.patch | 46 ----- ...-call-mmu-notifiers-on-thp-migration.patch | 103 ++++++++++ ...-get_user_page-against-thp-migration.patch | 193 ++++++++++++++++++ queue-3.12/series | 3 +- 4 files changed, 298 insertions(+), 47 deletions(-) delete mode 100644 queue-3.12/mm-clear-pmd_numa-before-invalidating.patch create mode 100644 queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch create mode 100644 queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch diff --git a/queue-3.12/mm-clear-pmd_numa-before-invalidating.patch b/queue-3.12/mm-clear-pmd_numa-before-invalidating.patch deleted file mode 100644 index 0b6bd84ec1d..00000000000 --- a/queue-3.12/mm-clear-pmd_numa-before-invalidating.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 67f87463d3a3362424efcbe8b40e4772fd34fc61 Mon Sep 17 00:00:00 2001 -From: Mel Gorman -Date: Wed, 18 Dec 2013 17:08:34 -0800 -Subject: mm: clear pmd_numa before invalidating - -From: Mel Gorman - -commit 67f87463d3a3362424efcbe8b40e4772fd34fc61 upstream. - -On x86, PMD entries are similar to _PAGE_PROTNONE protection and are -handled as NUMA hinting faults. The following two page table protection -bits are what defines them - - _PAGE_NUMA:set _PAGE_PRESENT:clear - -A PMD is considered present if any of the _PAGE_PRESENT, _PAGE_PROTNONE, -_PAGE_PSE or _PAGE_NUMA bits are set. If pmdp_invalidate encounters a -pmd_numa, it clears the present bit leaving _PAGE_NUMA which will be -considered not present by the CPU but present by pmd_present. The -existing caller of pmdp_invalidate should handle it but it's an -inconsistent state for a PMD. This patch keeps the state consistent -when calling pmdp_invalidate. - -Signed-off-by: Mel Gorman -Reviewed-by: Rik van Riel -Cc: Alex Thorlton -Signed-off-by: Andrew Morton -Signed-off-by: Linus Torvalds -Signed-off-by: Greg Kroah-Hartman - ---- - mm/pgtable-generic.c | 3 +++ - 1 file changed, 3 insertions(+) - ---- a/mm/pgtable-generic.c -+++ b/mm/pgtable-generic.c -@@ -191,6 +191,9 @@ pgtable_t pgtable_trans_huge_withdraw(st - void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) - { -+ pmd_t entry = *pmdp; -+ if (pmd_numa(entry)) -+ entry = pmd_mknonnuma(entry); - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } diff --git a/queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch b/queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch new file mode 100644 index 00000000000..708630c804b --- /dev/null +++ b/queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch @@ -0,0 +1,103 @@ +From mgorman@suse.de Tue Jan 7 09:45:19 2014 +From: Mel Gorman +Date: Tue, 7 Jan 2014 14:00:37 +0000 +Subject: mm: numa: call MMU notifiers on THP migration +To: gregkh@linuxfoundation.org +Cc: athorlton@sgi.com, riel@redhat.com, chegu_vinod@hp.com, Mel Gorman , stable@vger.kernel.org +Message-ID: <1389103248-17617-3-git-send-email-mgorman@suse.de> + +From: Mel Gorman + +commit f714f4f20e59ea6eea264a86b9a51fd51b88fc54 upstream. + +MMU notifiers must be called on THP page migration or secondary MMUs +will get very confused. 
+ +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Alex Thorlton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/migrate.c | 22 ++++++++++++++-------- + 1 file changed, 14 insertions(+), 8 deletions(-) + +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + #include + +@@ -1655,12 +1656,13 @@ int migrate_misplaced_transhuge_page(str + unsigned long address, + struct page *page, int node) + { +- unsigned long haddr = address & HPAGE_PMD_MASK; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); ++ unsigned long mmun_start = address & HPAGE_PMD_MASK; ++ unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + pmd_t orig_entry; + + /* +@@ -1702,10 +1704,12 @@ int migrate_misplaced_transhuge_page(str + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { + fail_putback: + spin_unlock(&mm->page_table_lock); ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) +@@ -1746,15 +1750,16 @@ fail_putback: + * The SetPageUptodate on the new page and page_add_new_anon_rmap + * guarantee the copy is visible before the pagetable update. + */ +- flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); +- page_add_new_anon_rmap(new_page, vma, haddr); +- pmdp_clear_flush(vma, haddr, pmd); +- set_pmd_at(mm, haddr, pmd, entry); ++ flush_cache_range(vma, mmun_start, mmun_end); ++ page_add_new_anon_rmap(new_page, vma, mmun_start); ++ pmdp_clear_flush(vma, mmun_start, pmd); ++ set_pmd_at(mm, mmun_start, pmd, entry); ++ flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + + if (page_count(page) != 2) { +- set_pmd_at(mm, haddr, pmd, orig_entry); +- flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); ++ set_pmd_at(mm, mmun_start, pmd, orig_entry); ++ flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(new_page); + goto fail_putback; +@@ -1769,6 +1774,7 @@ fail_putback: + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(&mm->page_table_lock); ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + unlock_page(new_page); + unlock_page(page); +@@ -1789,7 +1795,7 @@ out_dropref: + spin_lock(&mm->page_table_lock); + if (pmd_same(*pmd, entry)) { + entry = pmd_mknonnuma(entry); +- set_pmd_at(mm, haddr, pmd, entry); ++ set_pmd_at(mm, mmun_start, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + } + spin_unlock(&mm->page_table_lock); diff --git a/queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch b/queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch new file mode 100644 index 00000000000..a6723c7de86 --- /dev/null +++ b/queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch @@ -0,0 +1,193 @@ +From mgorman@suse.de Tue Jan 7 09:44:16 2014 +From: Mel Gorman +Date: Tue, 7 Jan 2014 14:00:36 +0000 +Subject: mm: numa: serialise parallel get_user_page against THP migration +To: gregkh@linuxfoundation.org +Cc: athorlton@sgi.com, riel@redhat.com, chegu_vinod@hp.com, Mel Gorman , stable@vger.kernel.org 
+Message-ID: <1389103248-17617-2-git-send-email-mgorman@suse.de> + +From: Mel Gorman + +commit 2b4847e73004c10ae6666c2e27b5c5430aed8698 upstream. + +Base pages are unmapped and flushed from cache and TLB during normal +page migration and replaced with a migration entry that causes any +parallel NUMA hinting fault or gup to block until migration completes. + +THP does not unmap pages due to a lack of support for migration entries +at a PMD level. This allows races with get_user_pages and +get_user_pages_fast which commit 3f926ab945b6 ("mm: Close races between +THP migration and PMD numa clearing") made worse by introducing a +pmd_clear_flush(). + +This patch forces get_user_page (fast and normal) on a pmd_numa page to +go through the slow get_user_page path where it will serialise against +THP migration and properly account for the NUMA hinting fault. On the +migration side the page table lock is taken for each PTE update. + +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Alex Thorlton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/gup.c | 13 +++++++++++++ + mm/huge_memory.c | 24 ++++++++++++++++-------- + mm/migrate.c | 38 +++++++++++++++++++++++++++++++------- + 3 files changed, 60 insertions(+), 15 deletions(-) + +--- a/arch/x86/mm/gup.c ++++ b/arch/x86/mm/gup.c +@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t + pte_t pte = gup_get_pte(ptep); + struct page *page; + ++ /* Similar to the PMD case, NUMA hinting must take slow path */ ++ if (pte_numa(pte)) { ++ pte_unmap(ptep); ++ return 0; ++ } ++ + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; +@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsi + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { ++ /* ++ * NUMA hinting faults need to be handled in the GUP ++ * slowpath for accounting purposes and so that they ++ * can be serialised against THP migration. ++ */ ++ if (pmd_numa(pmd)) ++ return 0; + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1240,6 +1240,10 @@ struct page *follow_trans_huge_pmd(struc + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) + return ERR_PTR(-EFAULT); + ++ /* Full NUMA hinting faults to serialise migration in fault paths */ ++ if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) ++ goto out; ++ + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { +@@ -1306,23 +1310,27 @@ int do_huge_pmd_numa_page(struct mm_stru + /* If the page was locked, there are no parallel migrations */ + if (page_locked) + goto clear_pmdnuma; ++ } + +- /* +- * Otherwise wait for potential migrations and retry. We do +- * relock and check_same as the page may no longer be mapped. +- * As the fault is being retried, do not account for it. +- */ ++ /* ++ * If there are potential migrations, wait for completion and retry. We ++ * do not relock and check_same as the page may no longer be mapped. ++ * Furtermore, even if the page is currently misplaced, there is no ++ * guarantee it is still misplaced after the migration completes. ++ */ ++ if (!page_locked) { + spin_unlock(&mm->page_table_lock); + wait_on_page_locked(page); + page_nid = -1; + goto out; + } + +- /* Page is misplaced, serialise migrations and parallel THP splits */ ++ /* ++ * Page is misplaced. Page lock serialises migrations. 
Acquire anon_vma ++ * to serialises splits ++ */ + get_page(page); + spin_unlock(&mm->page_table_lock); +- if (!page_locked) +- lock_page(page); + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PTE did not while locked */ +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1661,6 +1661,7 @@ int migrate_misplaced_transhuge_page(str + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); ++ pmd_t orig_entry; + + /* + * Don't migrate pages that are mapped in multiple processes. +@@ -1702,7 +1703,8 @@ int migrate_misplaced_transhuge_page(str + + /* Recheck the target PMD */ + spin_lock(&mm->page_table_lock); +- if (unlikely(!pmd_same(*pmd, entry))) { ++ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { ++fail_putback: + spin_unlock(&mm->page_table_lock); + + /* Reverse changes made by migrate_page_copy() */ +@@ -1732,16 +1734,34 @@ int migrate_misplaced_transhuge_page(str + */ + mem_cgroup_prepare_migration(page, new_page, &memcg); + ++ orig_entry = *pmd; + entry = mk_pmd(new_page, vma->vm_page_prot); +- entry = pmd_mknonnuma(entry); +- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); ++ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + ++ /* ++ * Clear the old entry under pagetable lock and establish the new PTE. ++ * Any parallel GUP will either observe the old page blocking on the ++ * page lock, block on the page table lock or observe the new page. ++ * The SetPageUptodate on the new page and page_add_new_anon_rmap ++ * guarantee the copy is visible before the pagetable update. ++ */ ++ flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); ++ page_add_new_anon_rmap(new_page, vma, haddr); + pmdp_clear_flush(vma, haddr, pmd); + set_pmd_at(mm, haddr, pmd, entry); +- page_add_new_anon_rmap(new_page, vma, haddr); + update_mmu_cache_pmd(vma, address, &entry); ++ ++ if (page_count(page) != 2) { ++ set_pmd_at(mm, haddr, pmd, orig_entry); ++ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); ++ update_mmu_cache_pmd(vma, address, &entry); ++ page_remove_rmap(new_page); ++ goto fail_putback; ++ } ++ + page_remove_rmap(page); ++ + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge +@@ -1766,9 +1786,13 @@ int migrate_misplaced_transhuge_page(str + out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + out_dropref: +- entry = pmd_mknonnuma(entry); +- set_pmd_at(mm, haddr, pmd, entry); +- update_mmu_cache_pmd(vma, address, &entry); ++ spin_lock(&mm->page_table_lock); ++ if (pmd_same(*pmd, entry)) { ++ entry = pmd_mknonnuma(entry); ++ set_pmd_at(mm, haddr, pmd, entry); ++ update_mmu_cache_pmd(vma, address, &entry); ++ } ++ spin_unlock(&mm->page_table_lock); + + unlock_page(page); + put_page(page); diff --git a/queue-3.12/series b/queue-3.12/series index 0b9f3b1922b..d7408c6dcf3 100644 --- a/queue-3.12/series +++ b/queue-3.12/series @@ -107,7 +107,8 @@ ext2-fix-oops-in-ext2_get_block-called-from-ext2_quota_write.patch acpi-pci-hotplug-avoid-warning-when-_adr-not-present.patch intel_pstate-fail-initialization-if-p-state-information-is-missing.patch revert-of-address-handle-address-cells-2-specially.patch -mm-clear-pmd_numa-before-invalidating.patch +mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch +mm-numa-call-mmu-notifiers-on-thp-migration.patch mm-numa-ensure-anon_vma-is-locked-to-prevent-parallel-thp-splits.patch mm-numa-avoid-unnecessary-work-on-the-failure-path.patch 
mm-fix-tlb-flush-race-between-migration-and-change_protection_range.patch -- 2.47.3
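
As an aside to the two queued patches above: the core idea of the serialisation fix — a lock-free fast path (get_user_pages_fast) that notices the pmd_numa marker and bails out to the slow path, which then blocks until the THP migration holding the lock has finished — can be sketched outside the kernel. The userspace C program below is a minimal analogy, not kernel code and not part of either patch; every name in it (current_page, migration_pending, page_lock, read_fast, read_slow, migrator) is invented for the sketch. It models only the control flow "marker seen on the fast path, so take the lock-protected slow path"; it does not model the kernel's page refcounting, TLB-flush IPIs, or memory-ordering guarantees.

/* Illustrative userspace analogy only -- build: cc -std=c11 -O2 -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ  4096
#define NREADERS 4

static char page_a[PAGE_SZ], page_b[PAGE_SZ];

static _Atomic(char *) current_page = page_a;     /* stands in for the PMD */
static atomic_bool migration_pending = false;     /* stands in for pmd_numa() */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER; /* "page lock" */

/* Slow path: block on the migrator's lock, then read a stable page. */
static char read_slow(size_t off)
{
	pthread_mutex_lock(&page_lock);
	char c = atomic_load(&current_page)[off];
	pthread_mutex_unlock(&page_lock);
	return c;
}

/* Fast path: lock-free, but punt to the slow path while migration is pending,
 * the way the patched gup_pmd_range() returns 0 on pmd_numa(). */
static char read_fast(size_t off)
{
	if (atomic_load(&migration_pending))
		return read_slow(off);
	return atomic_load(&current_page)[off];
}

static void *migrator(void *unused)
{
	for (int i = 0; i < 1000; i++) {
		pthread_mutex_lock(&page_lock);
		atomic_store(&migration_pending, true);

		char *src = atomic_load(&current_page);
		char *dst = (src == page_a) ? page_b : page_a;

		memcpy(dst, src, PAGE_SZ);          /* copy to the new "page"   */
		atomic_store(&current_page, dst);   /* publish the new mapping  */

		atomic_store(&migration_pending, false);
		pthread_mutex_unlock(&page_lock);
	}
	return unused;
}

static void *reader(void *unused)
{
	for (size_t i = 0; i < 1000000; i++)
		(void)read_fast(i % PAGE_SZ);
	return unused;
}

int main(void)
{
	pthread_t mig, rd[NREADERS];

	memset(page_a, 'x', PAGE_SZ);
	pthread_create(&mig, NULL, migrator, NULL);
	for (int i = 0; i < NREADERS; i++)
		pthread_create(&rd[i], NULL, reader, NULL);

	pthread_join(mig, NULL);
	for (int i = 0; i < NREADERS; i++)
		pthread_join(rd[i], NULL);
	puts("ok");
	return 0;
}

In the sketch the copy always lands in the buffer that is not currently published and the pointer swap happens under the lock, which roughly mirrors the kernel ordering the patches enforce: the data is copied into new_page first, the PMD is only rewritten under the page table lock, and the pmd_numa check pushes any concurrent get_user_pages onto the path that waits for that migration to complete.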