--- /dev/null
+From 0255d491848032f6c601b6410c3b8ebded3a37b1 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:47 +0100
+Subject: mm: Account for a THP NUMA hinting update as one PTE update
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 0255d491848032f6c601b6410c3b8ebded3a37b1 upstream.
+
+A THP PMD update is accounted for as 512 pages updated in vmstat. This is
+a large difference when estimating the cost of automatic NUMA balancing and
+can be misleading when comparing results that had collapsed versus split
+THP. This patch addresses the accounting issue.
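+
+For scale, a small userspace illustration (not kernel code; it only assumes
+4K base pages, where a PMD-mapped THP covers HPAGE_PMD_NR == 512 PTEs) of
+how much the old accounting inflated the numa_pte_updates counter:
+
+    #include <stdio.h>
+
+    #define HPAGE_PMD_NR 512    /* 2M THP / 4K base pages */
+
+    int main(void)
+    {
+        unsigned long thp_updates = 1000;
+
+        /* Before this patch each THP PMD update counted as 512 PTE updates */
+        printf("old numa_pte_updates: %lu\n", thp_updates * HPAGE_PMD_NR);
+        /* After this patch it counts as a single PTE update */
+        printf("new numa_pte_updates: %lu\n", thp_updates);
+        return 0;
+    }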
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-10-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mprotect.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -145,7 +145,7 @@ static inline unsigned long change_pmd_r
+ split_huge_page_pmd(vma, addr, pmd);
+ else if (change_huge_pmd(vma, pmd, addr, newprot,
+ prot_numa)) {
+- pages += HPAGE_PMD_NR;
++ pages++;
+ continue;
+ }
+ /* fall through */
--- /dev/null
+From 3f926ab945b60a5824369d21add7710622a2eac0 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:46 +0100
+Subject: mm: Close races between THP migration and PMD numa clearing
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 3f926ab945b60a5824369d21add7710622a2eac0 upstream.
+
+THP migration uses the page lock to guard against parallel allocations,
+but there are cases like this still open:
+
+ Task A Task B
+ --------------------- ---------------------
+ do_huge_pmd_numa_page do_huge_pmd_numa_page
+ lock_page
+ mpol_misplaced == -1
+ unlock_page
+ goto clear_pmdnuma
+ lock_page
+ mpol_misplaced == 2
+ migrate_misplaced_transhuge
+ pmd = pmd_mknonnuma
+ set_pmd_at
+
+During hours of testing, one machine crashed with weird errors and, while I
+have no direct evidence, I suspect something like the race above happened.
+This patch keeps the page lock held until the pmd_numa bit has been cleared
+so that a migration cannot start in parallel with the clearing. It also
+flushes the old pmd entry and orders the page table insertion before the
+rmap insertion.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 33 +++++++++++++++------------------
+ mm/migrate.c | 19 +++++++++++--------
+ 2 files changed, 26 insertions(+), 26 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1314,24 +1314,25 @@ int do_huge_pmd_numa_page(struct mm_stru
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ /* If the page was locked, there are no parallel migrations */
+- if (page_locked) {
+- unlock_page(page);
++ if (page_locked)
+ goto clear_pmdnuma;
+- }
+
+- /* Otherwise wait for potential migrations and retry fault */
++ /*
++ * Otherwise wait for potential migrations and retry. We do
++ * relock and check_same as the page may no longer be mapped.
++ * As the fault is being retried, do not account for it.
++ */
+ spin_unlock(&mm->page_table_lock);
+ wait_on_page_locked(page);
++ page_nid = -1;
+ goto out;
+ }
+
+ /* Page is misplaced, serialise migrations and parallel THP splits */
+ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+- if (!page_locked) {
++ if (!page_locked)
+ lock_page(page);
+- page_locked = true;
+- }
+ anon_vma = page_lock_anon_vma_read(page);
+
+ /* Confirm the PTE did not while locked */
+@@ -1339,32 +1340,28 @@ int do_huge_pmd_numa_page(struct mm_stru
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
++ page_nid = -1;
+ goto out_unlock;
+ }
+
+- /* Migrate the THP to the requested node */
++ /*
++ * Migrate the THP to the requested node, returns with page unlocked
++ * and pmd_numa cleared.
++ */
+ spin_unlock(&mm->page_table_lock);
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr, page, target_nid);
+ if (migrated)
+ page_nid = target_nid;
+- else
+- goto check_same;
+
+ goto out;
+-
+-check_same:
+- spin_lock(&mm->page_table_lock);
+- if (unlikely(!pmd_same(pmd, *pmdp))) {
+- /* Someone else took our fault */
+- page_nid = -1;
+- goto out_unlock;
+- }
+ clear_pmdnuma:
++ BUG_ON(!PageLocked(page));
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
++ unlock_page(page);
+ out_unlock:
+ spin_unlock(&mm->page_table_lock);
+
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1710,12 +1710,12 @@ int migrate_misplaced_transhuge_page(str
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+- unlock_page(page);
++ /* Retake the callers reference and putback on LRU */
++ get_page(page);
+ putback_lru_page(page);
+-
+- count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+- isolated = 0;
+- goto out;
++ mod_zone_page_state(page_zone(page),
++ NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
++ goto out_fail;
+ }
+
+ /*
+@@ -1732,9 +1732,9 @@ int migrate_misplaced_transhuge_page(str
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+- page_add_new_anon_rmap(new_page, vma, haddr);
+-
++ pmdp_clear_flush(vma, haddr, pmd);
+ set_pmd_at(mm, haddr, pmd, entry);
++ page_add_new_anon_rmap(new_page, vma, haddr);
+ update_mmu_cache_pmd(vma, address, &entry);
+ page_remove_rmap(page);
+ /*
+@@ -1753,7 +1753,6 @@ int migrate_misplaced_transhuge_page(str
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+-out:
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+@@ -1762,6 +1761,10 @@ out:
+ out_fail:
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ out_dropref:
++ entry = pmd_mknonnuma(entry);
++ set_pmd_at(mm, haddr, pmd, entry);
++ update_mmu_cache_pmd(vma, address, &entry);
++
+ unlock_page(page);
+ put_page(page);
+ return 0;
--- /dev/null
+From 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:42 +0100
+Subject: mm: numa: Do not account for a hinting fault if we raced
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 upstream.
+
+If another task handled a hinting fault in parallel then do not double
+account for it.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-5-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1335,8 +1335,11 @@ int do_huge_pmd_numa_page(struct mm_stru
+
+ check_same:
+ spin_lock(&mm->page_table_lock);
+- if (unlikely(!pmd_same(pmd, *pmdp)))
++ if (unlikely(!pmd_same(pmd, *pmdp))) {
++ /* Someone else took our fault */
++ current_nid = -1;
+ goto out_unlock;
++ }
+ clear_pmdnuma:
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
--- /dev/null
+From c61109e34f60f6e85bb43c5a1cd51c0e3db40847 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:45 +0100
+Subject: mm: numa: Sanitize task_numa_fault() callsites
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit c61109e34f60f6e85bb43c5a1cd51c0e3db40847 upstream.
+
+There are three callers of task_numa_fault():
+
+ - do_huge_pmd_numa_page():
+ Accounts against the current node, not the node where the
+ page resides, unless we migrated, in which case it accounts
+ against the node we migrated to.
+
+ - do_numa_page():
+ Accounts against the current node, not the node where the
+ page resides, unless we migrated, in which case it accounts
+ against the node we migrated to.
+
+ - do_pmd_numa_page():
+ Accounts not at all when the page isn't migrated, otherwise
+ accounts against the node we migrated towards.
+
+This seems wrong to me; all three sites should have the same
+semantics. Furthermore, we should account against where the page
+really is; we already know where the task is.
+
+So modify all three sites to always account; we did, after all, receive
+the fault. Always account against where the page is after migration,
+regardless of success.
+
+They all still differ on when they clear the PTE/PMD; ideally that
+would get sorted too.
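+
+As a small userspace sketch of the rule the three sites converge on (the
+kernel helpers are stubbed out here with simplified signatures and made-up
+node numbers, purely for illustration):
+
+    #include <stdio.h>
+    #include <stdbool.h>
+
+    static int page_to_nid(void)    { return 1; }   /* node the page is on */
+    static int numa_migrate_prep(void) { return 2; }   /* -1 == correctly placed */
+    static bool migrate_misplaced_page(int nid) { return nid != -1; }
+    static void task_numa_fault(int nid, int pages, bool migrated)
+    {
+        printf("account %d page(s) against node %d, migrated=%d\n",
+               pages, nid, migrated);
+    }
+
+    int main(void)
+    {
+        int page_nid = page_to_nid();
+        int target_nid = numa_migrate_prep();
+        bool migrated = false;
+
+        if (target_nid != -1) {
+            migrated = migrate_misplaced_page(target_nid);
+            if (migrated)
+                page_nid = target_nid;  /* the page moved; account to its new node */
+        }
+
+        /* Always account the fault, and always against where the page is now */
+        if (page_nid != -1)
+            task_numa_fault(page_nid, 1, migrated);
+        return 0;
+    }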
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-8-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 25 +++++++++++++------------
+ mm/memory.c | 53 +++++++++++++++++++++--------------------------------
+ 2 files changed, 34 insertions(+), 44 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1291,18 +1291,19 @@ int do_huge_pmd_numa_page(struct mm_stru
+ struct anon_vma *anon_vma = NULL;
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
++ int page_nid = -1, this_nid = numa_node_id();
+ int target_nid;
+- int current_nid = -1;
+- bool migrated, page_locked;
++ bool page_locked;
++ bool migrated = false;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+- current_nid = page_to_nid(page);
++ page_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+- if (current_nid == numa_node_id())
++ if (page_nid == this_nid)
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ /*
+@@ -1345,19 +1346,18 @@ int do_huge_pmd_numa_page(struct mm_stru
+ spin_unlock(&mm->page_table_lock);
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr, page, target_nid);
+- if (!migrated)
++ if (migrated)
++ page_nid = target_nid;
++ else
+ goto check_same;
+
+- task_numa_fault(target_nid, HPAGE_PMD_NR, true);
+- if (anon_vma)
+- page_unlock_anon_vma_read(anon_vma);
+- return 0;
++ goto out;
+
+ check_same:
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ /* Someone else took our fault */
+- current_nid = -1;
++ page_nid = -1;
+ goto out_unlock;
+ }
+ clear_pmdnuma:
+@@ -1372,8 +1372,9 @@ out:
+ if (anon_vma)
+ page_unlock_anon_vma_read(anon_vma);
+
+- if (current_nid != -1)
+- task_numa_fault(current_nid, HPAGE_PMD_NR, false);
++ if (page_nid != -1)
++ task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
++
+ return 0;
+ }
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3525,12 +3525,12 @@ static int do_nonlinear_fault(struct mm_
+ }
+
+ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+- unsigned long addr, int current_nid)
++ unsigned long addr, int page_nid)
+ {
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+- if (current_nid == numa_node_id())
++ if (page_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ return mpol_misplaced(page, vma, addr);
+@@ -3541,7 +3541,7 @@ int do_numa_page(struct mm_struct *mm, s
+ {
+ struct page *page = NULL;
+ spinlock_t *ptl;
+- int current_nid = -1;
++ int page_nid = -1;
+ int target_nid;
+ bool migrated = false;
+
+@@ -3571,15 +3571,10 @@ int do_numa_page(struct mm_struct *mm, s
+ return 0;
+ }
+
+- current_nid = page_to_nid(page);
+- target_nid = numa_migrate_prep(page, vma, addr, current_nid);
++ page_nid = page_to_nid(page);
++ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ pte_unmap_unlock(ptep, ptl);
+ if (target_nid == -1) {
+- /*
+- * Account for the fault against the current node if it not
+- * being replaced regardless of where the page is located.
+- */
+- current_nid = numa_node_id();
+ put_page(page);
+ goto out;
+ }
+@@ -3587,11 +3582,11 @@ int do_numa_page(struct mm_struct *mm, s
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+- current_nid = target_nid;
++ page_nid = target_nid;
+
+ out:
+- if (current_nid != -1)
+- task_numa_fault(current_nid, 1, migrated);
++ if (page_nid != -1)
++ task_numa_fault(page_nid, 1, migrated);
+ return 0;
+ }
+
+@@ -3606,7 +3601,6 @@ static int do_pmd_numa_page(struct mm_st
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+- int local_nid = numa_node_id();
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+@@ -3629,9 +3623,10 @@ static int do_pmd_numa_page(struct mm_st
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+- int curr_nid = local_nid;
++ int page_nid = -1;
+ int target_nid;
+- bool migrated;
++ bool migrated = false;
++
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+@@ -3653,25 +3648,19 @@ static int do_pmd_numa_page(struct mm_st
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+- /*
+- * Note that the NUMA fault is later accounted to either
+- * the node that is currently running or where the page is
+- * migrated to.
+- */
+- curr_nid = local_nid;
+- target_nid = numa_migrate_prep(page, vma, addr,
+- page_to_nid(page));
+- if (target_nid == -1) {
++ page_nid = page_to_nid(page);
++ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
++ pte_unmap_unlock(pte, ptl);
++ if (target_nid != -1) {
++ migrated = migrate_misplaced_page(page, target_nid);
++ if (migrated)
++ page_nid = target_nid;
++ } else {
+ put_page(page);
+- continue;
+ }
+
+- /* Migrate to the requested node */
+- pte_unmap_unlock(pte, ptl);
+- migrated = migrate_misplaced_page(page, target_nid);
+- if (migrated)
+- curr_nid = target_nid;
+- task_numa_fault(curr_nid, 1, migrated);
++ if (page_nid != -1)
++ task_numa_fault(page_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
--- /dev/null
+From 587fe586f44a48f9691001ba6c45b86c8e4ba21f Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:44 +0100
+Subject: mm: Prevent parallel splits during THP migration
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 587fe586f44a48f9691001ba6c45b86c8e4ba21f upstream.
+
+THP migrations are serialised by the page lock but on its own that does
+not prevent THP splits. If the page is split during THP migration then
+the pmd_same checks will prevent page table corruption, but the page unlock
+and other fix-ups may still cause corruption. This patch takes the
+anon_vma lock to prevent parallel splits during migration.
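+
+Roughly, a THP split needs the anon_vma lock for write, so holding it for
+read across the migration excludes splits. A loose userspace analogy with a
+pthread rwlock (illustration only, not kernel code):
+
+    #include <pthread.h>
+    #include <stdio.h>
+
+    static pthread_rwlock_t anon_vma_lock = PTHREAD_RWLOCK_INITIALIZER;
+
+    static void migrate_thp(void)
+    {
+        pthread_rwlock_rdlock(&anon_vma_lock);   /* page_lock_anon_vma_read() */
+        printf("migrating THP: no split can start\n");
+        pthread_rwlock_unlock(&anon_vma_lock);   /* page_unlock_anon_vma_read() */
+    }
+
+    static void split_thp(void)
+    {
+        pthread_rwlock_wrlock(&anon_vma_lock);   /* waits for the migration */
+        printf("splitting THP\n");
+        pthread_rwlock_unlock(&anon_vma_lock);
+    }
+
+    int main(void)
+    {
+        migrate_thp();
+        split_thp();
+        return 0;
+    }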
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-7-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 44 ++++++++++++++++++++++++++++++--------------
+ 1 file changed, 30 insertions(+), 14 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1288,18 +1288,18 @@ out:
+ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+ {
++ struct anon_vma *anon_vma = NULL;
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+- bool migrated;
++ bool migrated, page_locked;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+- get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+@@ -1309,12 +1309,29 @@ int do_huge_pmd_numa_page(struct mm_stru
+ * Acquire the page lock to serialise THP migrations but avoid dropping
+ * page_table_lock if at all possible
+ */
+- if (trylock_page(page))
+- goto got_lock;
++ page_locked = trylock_page(page);
++ target_nid = mpol_misplaced(page, vma, haddr);
++ if (target_nid == -1) {
++ /* If the page was locked, there are no parallel migrations */
++ if (page_locked) {
++ unlock_page(page);
++ goto clear_pmdnuma;
++ }
++
++ /* Otherwise wait for potential migrations and retry fault */
++ spin_unlock(&mm->page_table_lock);
++ wait_on_page_locked(page);
++ goto out;
++ }
+
+- /* Serialise against migration and check placement */
++ /* Page is misplaced, serialise migrations and parallel THP splits */
++ get_page(page);
+ spin_unlock(&mm->page_table_lock);
+- lock_page(page);
++ if (!page_locked) {
++ lock_page(page);
++ page_locked = true;
++ }
++ anon_vma = page_lock_anon_vma_read(page);
+
+ /* Confirm the PTE did not while locked */
+ spin_lock(&mm->page_table_lock);
+@@ -1324,14 +1341,6 @@ int do_huge_pmd_numa_page(struct mm_stru
+ goto out_unlock;
+ }
+
+-got_lock:
+- target_nid = mpol_misplaced(page, vma, haddr);
+- if (target_nid == -1) {
+- unlock_page(page);
+- put_page(page);
+- goto clear_pmdnuma;
+- }
+-
+ /* Migrate the THP to the requested node */
+ spin_unlock(&mm->page_table_lock);
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+@@ -1340,6 +1349,8 @@ got_lock:
+ goto check_same;
+
+ task_numa_fault(target_nid, HPAGE_PMD_NR, true);
++ if (anon_vma)
++ page_unlock_anon_vma_read(anon_vma);
+ return 0;
+
+ check_same:
+@@ -1356,6 +1367,11 @@ clear_pmdnuma:
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ out_unlock:
+ spin_unlock(&mm->page_table_lock);
++
++out:
++ if (anon_vma)
++ page_unlock_anon_vma_read(anon_vma);
++
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+ return 0;
--- /dev/null
+From 42836f5f8baa33085f547098b74aa98991ee9216 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:43 +0100
+Subject: mm: Wait for THP migrations to complete during NUMA hinting faults
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 42836f5f8baa33085f547098b74aa98991ee9216 upstream.
+
+The locking for migrating THP is unusual. While normal page migration
+prevents parallel accesses using a migration PTE, THP migration relies on
+a combination of the page_table_lock, the page lock and the existence of
+the NUMA hinting PTE to guarantee safety, but there is a bug in the scheme.
+
+If a THP page is currently being migrated and another thread traps a
+fault on the same page, it checks whether the page is misplaced and, if it
+is not, clears pmd_numa. The problem is that this check is made without
+holding the page lock, so the racing thread can still be migrating the THP
+when the second thread clears the NUMA bit and faults in a stale page.
+
+This patch checks whether the page is potentially being migrated and, if
+so, stalls using lock_page until the migration completes before checking
+whether the page is misplaced.
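+
+A very loose userspace analogy of the new serialisation (illustration only;
+"page_lock" stands in for the THP's page lock, which a migration holds for
+its whole duration, and "table_lock" for the page_table_lock held when the
+hinting fault is taken):
+
+    #include <pthread.h>
+    #include <stdio.h>
+
+    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
+    static pthread_mutex_t page_lock  = PTHREAD_MUTEX_INITIALIZER;
+
+    static void hinting_fault(void)
+    {
+        pthread_mutex_lock(&table_lock);
+
+        if (pthread_mutex_trylock(&page_lock) != 0) {
+            /* A migration may be in flight: drop the table lock and wait
+             * for it to finish rather than trusting a possibly stale page. */
+            pthread_mutex_unlock(&table_lock);
+            pthread_mutex_lock(&page_lock);
+            pthread_mutex_lock(&table_lock);
+            /* ...the kernel also rechecks pmd_same() at this point... */
+        }
+
+        /* Only now is it safe to check placement or clear the NUMA bit */
+        printf("placement checked with no migration in flight\n");
+
+        pthread_mutex_unlock(&page_lock);
+        pthread_mutex_unlock(&table_lock);
+    }
+
+    int main(void)
+    {
+        hinting_fault();
+        return 0;
+    }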
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-6-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1305,13 +1305,14 @@ int do_huge_pmd_numa_page(struct mm_stru
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+- target_nid = mpol_misplaced(page, vma, haddr);
+- if (target_nid == -1) {
+- put_page(page);
+- goto clear_pmdnuma;
+- }
++ /*
++ * Acquire the page lock to serialise THP migrations but avoid dropping
++ * page_table_lock if at all possible
++ */
++ if (trylock_page(page))
++ goto got_lock;
+
+- /* Acquire the page lock to serialise THP migrations */
++ /* Serialise against migration and check placement */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+
+@@ -1322,9 +1323,17 @@ int do_huge_pmd_numa_page(struct mm_stru
+ put_page(page);
+ goto out_unlock;
+ }
+- spin_unlock(&mm->page_table_lock);
++
++got_lock:
++ target_nid = mpol_misplaced(page, vma, haddr);
++ if (target_nid == -1) {
++ unlock_page(page);
++ put_page(page);
++ goto clear_pmdnuma;
++ }
+
+ /* Migrate the THP to the requested node */
++ spin_unlock(&mm->page_table_lock);
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr, page, target_nid);
+ if (!migrated)
lib-scatterlist.c-don-t-flush_kernel_dcache_page-on-slab-page.patch
aacraid-missing-capable-check-in-compat-ioctl.patch
clk-fixup-argument-order-when-setting-vco-parameters.patch
+mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch
+mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch
+mm-prevent-parallel-splits-during-thp-migration.patch
+mm-numa-sanitize-task_numa_fault-callsites.patch
+mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch
+mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch