From 78e1b45d12d9ceedb576f81e3ca99e8aa13e7e14 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Fri, 8 Nov 2013 21:55:19 -0800
Subject: [PATCH] 3.10-stable patches

added patches:
	mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch
	mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch
	mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch
	mm-numa-sanitize-task_numa_fault-callsites.patch
	mm-prevent-parallel-splits-during-thp-migration.patch
	mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch
---
 ...uma-hinting-update-as-one-pte-update.patch |  39 +++
 ...-thp-migration-and-pmd-numa-clearing.patch | 173 +++++++++++++
 ...ount-for-a-hinting-fault-if-we-raced.patch |  41 ++++
 ...a-sanitize-task_numa_fault-callsites.patch | 230 ++++++++++++++++++
 ...parallel-splits-during-thp-migration.patch | 122 ++++++++++
 queue-3.10/series                             |   6 +
 ...-complete-during-numa-hinting-faults.patch |  81 ++++++
 7 files changed, 692 insertions(+)
 create mode 100644 queue-3.10/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch
 create mode 100644 queue-3.10/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch
 create mode 100644 queue-3.10/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch
 create mode 100644 queue-3.10/mm-numa-sanitize-task_numa_fault-callsites.patch
 create mode 100644 queue-3.10/mm-prevent-parallel-splits-during-thp-migration.patch
 create mode 100644 queue-3.10/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch

diff --git a/queue-3.10/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch b/queue-3.10/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch
new file mode 100644
index 00000000000..2619c0fdf9b
--- /dev/null
+++ b/queue-3.10/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch
@@ -0,0 +1,39 @@
+From 0255d491848032f6c601b6410c3b8ebded3a37b1 Mon Sep 17 00:00:00 2001
+From: Mel Gorman
+Date: Mon, 7 Oct 2013 11:28:47 +0100
+Subject: mm: Account for a THP NUMA hinting update as one PTE update
+
+From: Mel Gorman
+
+commit 0255d491848032f6c601b6410c3b8ebded3a37b1 upstream.
+
+A THP PMD update is accounted for as 512 pages updated in vmstat. This is
+large difference when estimating the cost of automatic NUMA balancing and
+can be misleading when comparing results that had collapsed versus split
+THP. This patch addresses the accounting issue.
+ +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Andrea Arcangeli +Cc: Johannes Weiner +Cc: Srikar Dronamraju +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/1381141781-10992-10-git-send-email-mgorman@suse.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mprotect.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -145,7 +145,7 @@ static inline unsigned long change_pmd_r + split_huge_page_pmd(vma, addr, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { +- pages += HPAGE_PMD_NR; ++ pages++; + continue; + } + /* fall through */ diff --git a/queue-3.10/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch b/queue-3.10/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch new file mode 100644 index 00000000000..6077e011e2c --- /dev/null +++ b/queue-3.10/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch @@ -0,0 +1,173 @@ +From 3f926ab945b60a5824369d21add7710622a2eac0 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Mon, 7 Oct 2013 11:28:46 +0100 +Subject: mm: Close races between THP migration and PMD numa clearing + +From: Mel Gorman + +commit 3f926ab945b60a5824369d21add7710622a2eac0 upstream. + +THP migration uses the page lock to guard against parallel allocations +but there are cases like this still open + + Task A Task B + --------------------- --------------------- + do_huge_pmd_numa_page do_huge_pmd_numa_page + lock_page + mpol_misplaced == -1 + unlock_page + goto clear_pmdnuma + lock_page + mpol_misplaced == 2 + migrate_misplaced_transhuge + pmd = pmd_mknonnuma + set_pmd_at + +During hours of testing, one crashed with weird errors and while I have +no direct evidence, I suspect something like the race above happened. +This patch extends the page lock to being held until the pmd_numa is +cleared to prevent migration starting in parallel while the pmd_numa is +being cleared. It also flushes the old pmd entry and orders pagetable +insertion before rmap insertion. + +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Andrea Arcangeli +Cc: Johannes Weiner +Cc: Srikar Dronamraju +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 33 +++++++++++++++------------------ + mm/migrate.c | 19 +++++++++++-------- + 2 files changed, 26 insertions(+), 26 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1314,24 +1314,25 @@ int do_huge_pmd_numa_page(struct mm_stru + target_nid = mpol_misplaced(page, vma, haddr); + if (target_nid == -1) { + /* If the page was locked, there are no parallel migrations */ +- if (page_locked) { +- unlock_page(page); ++ if (page_locked) + goto clear_pmdnuma; +- } + +- /* Otherwise wait for potential migrations and retry fault */ ++ /* ++ * Otherwise wait for potential migrations and retry. We do ++ * relock and check_same as the page may no longer be mapped. ++ * As the fault is being retried, do not account for it. 
++ */ + spin_unlock(&mm->page_table_lock); + wait_on_page_locked(page); ++ page_nid = -1; + goto out; + } + + /* Page is misplaced, serialise migrations and parallel THP splits */ + get_page(page); + spin_unlock(&mm->page_table_lock); +- if (!page_locked) { ++ if (!page_locked) + lock_page(page); +- page_locked = true; +- } + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PTE did not while locked */ +@@ -1339,32 +1340,28 @@ int do_huge_pmd_numa_page(struct mm_stru + if (unlikely(!pmd_same(pmd, *pmdp))) { + unlock_page(page); + put_page(page); ++ page_nid = -1; + goto out_unlock; + } + +- /* Migrate the THP to the requested node */ ++ /* ++ * Migrate the THP to the requested node, returns with page unlocked ++ * and pmd_numa cleared. ++ */ + spin_unlock(&mm->page_table_lock); + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, page, target_nid); + if (migrated) + page_nid = target_nid; +- else +- goto check_same; + + goto out; +- +-check_same: +- spin_lock(&mm->page_table_lock); +- if (unlikely(!pmd_same(pmd, *pmdp))) { +- /* Someone else took our fault */ +- page_nid = -1; +- goto out_unlock; +- } + clear_pmdnuma: ++ BUG_ON(!PageLocked(page)); + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); + VM_BUG_ON(pmd_numa(*pmdp)); + update_mmu_cache_pmd(vma, addr, pmdp); ++ unlock_page(page); + out_unlock: + spin_unlock(&mm->page_table_lock); + +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1710,12 +1710,12 @@ int migrate_misplaced_transhuge_page(str + unlock_page(new_page); + put_page(new_page); /* Free it */ + +- unlock_page(page); ++ /* Retake the callers reference and putback on LRU */ ++ get_page(page); + putback_lru_page(page); +- +- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); +- isolated = 0; +- goto out; ++ mod_zone_page_state(page_zone(page), ++ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR); ++ goto out_fail; + } + + /* +@@ -1732,9 +1732,9 @@ int migrate_misplaced_transhuge_page(str + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + +- page_add_new_anon_rmap(new_page, vma, haddr); +- ++ pmdp_clear_flush(vma, haddr, pmd); + set_pmd_at(mm, haddr, pmd, entry); ++ page_add_new_anon_rmap(new_page, vma, haddr); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(page); + /* +@@ -1753,7 +1753,6 @@ int migrate_misplaced_transhuge_page(str + count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); + count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); + +-out: + mod_zone_page_state(page_zone(page), + NR_ISOLATED_ANON + page_lru, + -HPAGE_PMD_NR); +@@ -1762,6 +1761,10 @@ out: + out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + out_dropref: ++ entry = pmd_mknonnuma(entry); ++ set_pmd_at(mm, haddr, pmd, entry); ++ update_mmu_cache_pmd(vma, address, &entry); ++ + unlock_page(page); + put_page(page); + return 0; diff --git a/queue-3.10/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch b/queue-3.10/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch new file mode 100644 index 00000000000..774753d20c3 --- /dev/null +++ b/queue-3.10/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch @@ -0,0 +1,41 @@ +From 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Mon, 7 Oct 2013 11:28:42 +0100 +Subject: mm: numa: Do not account for a hinting fault if we raced + +From: Mel Gorman + +commit 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 upstream. + +If another task handled a hinting fault in parallel then do not double +account for it. 
+ +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Andrea Arcangeli +Cc: Johannes Weiner +Cc: Srikar Dronamraju +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/1381141781-10992-5-git-send-email-mgorman@suse.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1335,8 +1335,11 @@ int do_huge_pmd_numa_page(struct mm_stru + + check_same: + spin_lock(&mm->page_table_lock); +- if (unlikely(!pmd_same(pmd, *pmdp))) ++ if (unlikely(!pmd_same(pmd, *pmdp))) { ++ /* Someone else took our fault */ ++ current_nid = -1; + goto out_unlock; ++ } + clear_pmdnuma: + pmd = pmd_mknonnuma(pmd); + set_pmd_at(mm, haddr, pmdp, pmd); diff --git a/queue-3.10/mm-numa-sanitize-task_numa_fault-callsites.patch b/queue-3.10/mm-numa-sanitize-task_numa_fault-callsites.patch new file mode 100644 index 00000000000..93bc3c2be67 --- /dev/null +++ b/queue-3.10/mm-numa-sanitize-task_numa_fault-callsites.patch @@ -0,0 +1,230 @@ +From c61109e34f60f6e85bb43c5a1cd51c0e3db40847 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Mon, 7 Oct 2013 11:28:45 +0100 +Subject: mm: numa: Sanitize task_numa_fault() callsites + +From: Mel Gorman + +commit c61109e34f60f6e85bb43c5a1cd51c0e3db40847 upstream. + +There are three callers of task_numa_fault(): + + - do_huge_pmd_numa_page(): + Accounts against the current node, not the node where the + page resides, unless we migrated, in which case it accounts + against the node we migrated to. + + - do_numa_page(): + Accounts against the current node, not the node where the + page resides, unless we migrated, in which case it accounts + against the node we migrated to. + + - do_pmd_numa_page(): + Accounts not at all when the page isn't migrated, otherwise + accounts against the node we migrated towards. + +This seems wrong to me; all three sites should have the same +sementaics, furthermore we should accounts against where the page +really is, we already know where the task is. + +So modify all three sites to always account; we did after all receive +the fault; and always account to where the page is after migration, +regardless of success. + +They all still differ on when they clear the PTE/PMD; ideally that +would get sorted too. 
+ +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Andrea Arcangeli +Cc: Johannes Weiner +Cc: Srikar Dronamraju +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/1381141781-10992-8-git-send-email-mgorman@suse.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 25 +++++++++++++------------ + mm/memory.c | 53 +++++++++++++++++++++-------------------------------- + 2 files changed, 34 insertions(+), 44 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1291,18 +1291,19 @@ int do_huge_pmd_numa_page(struct mm_stru + struct anon_vma *anon_vma = NULL; + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; ++ int page_nid = -1, this_nid = numa_node_id(); + int target_nid; +- int current_nid = -1; +- bool migrated, page_locked; ++ bool page_locked; ++ bool migrated = false; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); +- current_nid = page_to_nid(page); ++ page_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); +- if (current_nid == numa_node_id()) ++ if (page_nid == this_nid) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + /* +@@ -1345,19 +1346,18 @@ int do_huge_pmd_numa_page(struct mm_stru + spin_unlock(&mm->page_table_lock); + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, page, target_nid); +- if (!migrated) ++ if (migrated) ++ page_nid = target_nid; ++ else + goto check_same; + +- task_numa_fault(target_nid, HPAGE_PMD_NR, true); +- if (anon_vma) +- page_unlock_anon_vma_read(anon_vma); +- return 0; ++ goto out; + + check_same: + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) { + /* Someone else took our fault */ +- current_nid = -1; ++ page_nid = -1; + goto out_unlock; + } + clear_pmdnuma: +@@ -1372,8 +1372,9 @@ out: + if (anon_vma) + page_unlock_anon_vma_read(anon_vma); + +- if (current_nid != -1) +- task_numa_fault(current_nid, HPAGE_PMD_NR, false); ++ if (page_nid != -1) ++ task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); ++ + return 0; + } + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3525,12 +3525,12 @@ static int do_nonlinear_fault(struct mm_ + } + + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +- unsigned long addr, int current_nid) ++ unsigned long addr, int page_nid) + { + get_page(page); + + count_vm_numa_event(NUMA_HINT_FAULTS); +- if (current_nid == numa_node_id()) ++ if (page_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + + return mpol_misplaced(page, vma, addr); +@@ -3541,7 +3541,7 @@ int do_numa_page(struct mm_struct *mm, s + { + struct page *page = NULL; + spinlock_t *ptl; +- int current_nid = -1; ++ int page_nid = -1; + int target_nid; + bool migrated = false; + +@@ -3571,15 +3571,10 @@ int do_numa_page(struct mm_struct *mm, s + return 0; + } + +- current_nid = page_to_nid(page); +- target_nid = numa_migrate_prep(page, vma, addr, current_nid); ++ page_nid = page_to_nid(page); ++ target_nid = numa_migrate_prep(page, vma, addr, page_nid); + pte_unmap_unlock(ptep, ptl); + if (target_nid == -1) { +- /* +- * Account for the fault against the current node if it not +- * being replaced regardless of where the page is located. 
+- */ +- current_nid = numa_node_id(); + put_page(page); + goto out; + } +@@ -3587,11 +3582,11 @@ int do_numa_page(struct mm_struct *mm, s + /* Migrate to the requested node */ + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) +- current_nid = target_nid; ++ page_nid = target_nid; + + out: +- if (current_nid != -1) +- task_numa_fault(current_nid, 1, migrated); ++ if (page_nid != -1) ++ task_numa_fault(page_nid, 1, migrated); + return 0; + } + +@@ -3606,7 +3601,6 @@ static int do_pmd_numa_page(struct mm_st + unsigned long offset; + spinlock_t *ptl; + bool numa = false; +- int local_nid = numa_node_id(); + + spin_lock(&mm->page_table_lock); + pmd = *pmdp; +@@ -3629,9 +3623,10 @@ static int do_pmd_numa_page(struct mm_st + for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { + pte_t pteval = *pte; + struct page *page; +- int curr_nid = local_nid; ++ int page_nid = -1; + int target_nid; +- bool migrated; ++ bool migrated = false; ++ + if (!pte_present(pteval)) + continue; + if (!pte_numa(pteval)) +@@ -3653,25 +3648,19 @@ static int do_pmd_numa_page(struct mm_st + if (unlikely(page_mapcount(page) != 1)) + continue; + +- /* +- * Note that the NUMA fault is later accounted to either +- * the node that is currently running or where the page is +- * migrated to. +- */ +- curr_nid = local_nid; +- target_nid = numa_migrate_prep(page, vma, addr, +- page_to_nid(page)); +- if (target_nid == -1) { ++ page_nid = page_to_nid(page); ++ target_nid = numa_migrate_prep(page, vma, addr, page_nid); ++ pte_unmap_unlock(pte, ptl); ++ if (target_nid != -1) { ++ migrated = migrate_misplaced_page(page, target_nid); ++ if (migrated) ++ page_nid = target_nid; ++ } else { + put_page(page); +- continue; + } + +- /* Migrate to the requested node */ +- pte_unmap_unlock(pte, ptl); +- migrated = migrate_misplaced_page(page, target_nid); +- if (migrated) +- curr_nid = target_nid; +- task_numa_fault(curr_nid, 1, migrated); ++ if (page_nid != -1) ++ task_numa_fault(page_nid, 1, migrated); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); + } diff --git a/queue-3.10/mm-prevent-parallel-splits-during-thp-migration.patch b/queue-3.10/mm-prevent-parallel-splits-during-thp-migration.patch new file mode 100644 index 00000000000..01dd846f21b --- /dev/null +++ b/queue-3.10/mm-prevent-parallel-splits-during-thp-migration.patch @@ -0,0 +1,122 @@ +From 587fe586f44a48f9691001ba6c45b86c8e4ba21f Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Mon, 7 Oct 2013 11:28:44 +0100 +Subject: mm: Prevent parallel splits during THP migration + +From: Mel Gorman + +commit 587fe586f44a48f9691001ba6c45b86c8e4ba21f upstream. + +THP migrations are serialised by the page lock but on its own that does +not prevent THP splits. If the page is split during THP migration then +the pmd_same checks will prevent page table corruption but the unlock page +and other fix-ups potentially will cause corruption. This patch takes the +anon_vma lock to prevent parallel splits during migration. 
+ +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Andrea Arcangeli +Cc: Johannes Weiner +Cc: Srikar Dronamraju +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/1381141781-10992-7-git-send-email-mgorman@suse.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++-------------- + 1 file changed, 30 insertions(+), 14 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1288,18 +1288,18 @@ out: + int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) + { ++ struct anon_vma *anon_vma = NULL; + struct page *page; + unsigned long haddr = addr & HPAGE_PMD_MASK; + int target_nid; + int current_nid = -1; +- bool migrated; ++ bool migrated, page_locked; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(pmd, *pmdp))) + goto out_unlock; + + page = pmd_page(pmd); +- get_page(page); + current_nid = page_to_nid(page); + count_vm_numa_event(NUMA_HINT_FAULTS); + if (current_nid == numa_node_id()) +@@ -1309,12 +1309,29 @@ int do_huge_pmd_numa_page(struct mm_stru + * Acquire the page lock to serialise THP migrations but avoid dropping + * page_table_lock if at all possible + */ +- if (trylock_page(page)) +- goto got_lock; ++ page_locked = trylock_page(page); ++ target_nid = mpol_misplaced(page, vma, haddr); ++ if (target_nid == -1) { ++ /* If the page was locked, there are no parallel migrations */ ++ if (page_locked) { ++ unlock_page(page); ++ goto clear_pmdnuma; ++ } ++ ++ /* Otherwise wait for potential migrations and retry fault */ ++ spin_unlock(&mm->page_table_lock); ++ wait_on_page_locked(page); ++ goto out; ++ } + +- /* Serialise against migrationa and check placement check placement */ ++ /* Page is misplaced, serialise migrations and parallel THP splits */ ++ get_page(page); + spin_unlock(&mm->page_table_lock); +- lock_page(page); ++ if (!page_locked) { ++ lock_page(page); ++ page_locked = true; ++ } ++ anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PTE did not while locked */ + spin_lock(&mm->page_table_lock); +@@ -1324,14 +1341,6 @@ int do_huge_pmd_numa_page(struct mm_stru + goto out_unlock; + } + +-got_lock: +- target_nid = mpol_misplaced(page, vma, haddr); +- if (target_nid == -1) { +- unlock_page(page); +- put_page(page); +- goto clear_pmdnuma; +- } +- + /* Migrate the THP to the requested node */ + spin_unlock(&mm->page_table_lock); + migrated = migrate_misplaced_transhuge_page(mm, vma, +@@ -1340,6 +1349,8 @@ got_lock: + goto check_same; + + task_numa_fault(target_nid, HPAGE_PMD_NR, true); ++ if (anon_vma) ++ page_unlock_anon_vma_read(anon_vma); + return 0; + + check_same: +@@ -1356,6 +1367,11 @@ clear_pmdnuma: + update_mmu_cache_pmd(vma, addr, pmdp); + out_unlock: + spin_unlock(&mm->page_table_lock); ++ ++out: ++ if (anon_vma) ++ page_unlock_anon_vma_read(anon_vma); ++ + if (current_nid != -1) + task_numa_fault(current_nid, HPAGE_PMD_NR, false); + return 0; diff --git a/queue-3.10/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch b/queue-3.10/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch new file mode 100644 index 00000000000..05f1a00b6a0 --- /dev/null +++ b/queue-3.10/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch @@ -0,0 +1,81 @@ +From 42836f5f8baa33085f547098b74aa98991ee9216 Mon Sep 17 00:00:00 2001 +From: Mel Gorman +Date: Mon, 7 Oct 2013 11:28:43 +0100 +Subject: mm: Wait for THP migrations to complete during NUMA 
hinting faults + +From: Mel Gorman + +commit 42836f5f8baa33085f547098b74aa98991ee9216 upstream. + +The locking for migrating THP is unusual. While normal page migration +prevents parallel accesses using a migration PTE, THP migration relies on +a combination of the page_table_lock, the page lock and the existance of +the NUMA hinting PTE to guarantee safety but there is a bug in the scheme. + +If a THP page is currently being migrated and another thread traps a +fault on the same page it checks if the page is misplaced. If it is not, +then pmd_numa is cleared. The problem is that it checks if the page is +misplaced without holding the page lock meaning that the racing thread +can be migrating the THP when the second thread clears the NUMA bit +and faults a stale page. + +This patch checks if the page is potentially being migrated and stalls +using the lock_page if it is potentially being migrated before checking +if the page is misplaced or not. + +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Andrea Arcangeli +Cc: Johannes Weiner +Cc: Srikar Dronamraju +Signed-off-by: Peter Zijlstra +Link: http://lkml.kernel.org/r/1381141781-10992-6-git-send-email-mgorman@suse.de +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 23 ++++++++++++++++------- + 1 file changed, 16 insertions(+), 7 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1305,13 +1305,14 @@ int do_huge_pmd_numa_page(struct mm_stru + if (current_nid == numa_node_id()) + count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); + +- target_nid = mpol_misplaced(page, vma, haddr); +- if (target_nid == -1) { +- put_page(page); +- goto clear_pmdnuma; +- } ++ /* ++ * Acquire the page lock to serialise THP migrations but avoid dropping ++ * page_table_lock if at all possible ++ */ ++ if (trylock_page(page)) ++ goto got_lock; + +- /* Acquire the page lock to serialise THP migrations */ ++ /* Serialise against migrationa and check placement check placement */ + spin_unlock(&mm->page_table_lock); + lock_page(page); + +@@ -1322,9 +1323,17 @@ int do_huge_pmd_numa_page(struct mm_stru + put_page(page); + goto out_unlock; + } +- spin_unlock(&mm->page_table_lock); ++ ++got_lock: ++ target_nid = mpol_misplaced(page, vma, haddr); ++ if (target_nid == -1) { ++ unlock_page(page); ++ put_page(page); ++ goto clear_pmdnuma; ++ } + + /* Migrate the THP to the requested node */ ++ spin_unlock(&mm->page_table_lock); + migrated = migrate_misplaced_transhuge_page(mm, vma, + pmdp, pmd, addr, page, target_nid); + if (!migrated) diff --git a/queue-3.10/series b/queue-3.10/series index d9a89241889..b9767842747 100644 --- a/queue-3.10/series +++ b/queue-3.10/series @@ -54,3 +54,9 @@ fix-a-few-incorrectly-checked-remap_pfn_range-calls.patch lib-scatterlist.c-don-t-flush_kernel_dcache_page-on-slab-page.patch aacraid-missing-capable-check-in-compat-ioctl.patch clk-fixup-argument-order-when-setting-vco-parameters.patch +mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch +mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch +mm-prevent-parallel-splits-during-thp-migration.patch +mm-numa-sanitize-task_numa_fault-callsites.patch +mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch +mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch -- 2.47.3