3.11-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 9 Nov 2013 05:55:24 +0000 (21:55 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 9 Nov 2013 05:55:24 +0000 (21:55 -0800)
added patches:
mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch
mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch
mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch
mm-numa-sanitize-task_numa_fault-callsites.patch
mm-prevent-parallel-splits-during-thp-migration.patch
mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch

queue-3.11/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch [new file with mode: 0644]
queue-3.11/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch [new file with mode: 0644]
queue-3.11/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch [new file with mode: 0644]
queue-3.11/mm-numa-sanitize-task_numa_fault-callsites.patch [new file with mode: 0644]
queue-3.11/mm-prevent-parallel-splits-during-thp-migration.patch [new file with mode: 0644]
queue-3.11/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch [new file with mode: 0644]
queue-3.11/series

diff --git a/queue-3.11/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch b/queue-3.11/mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch
new file mode 100644
index 0000000..15016d6
--- /dev/null
@@ -0,0 +1,39 @@
+From 0255d491848032f6c601b6410c3b8ebded3a37b1 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:47 +0100
+Subject: mm: Account for a THP NUMA hinting update as one PTE update
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 0255d491848032f6c601b6410c3b8ebded3a37b1 upstream.
+
+A THP PMD update is accounted for as 512 pages updated in vmstat.  This is
+a large difference when estimating the cost of automatic NUMA balancing
+and can be misleading when comparing results that had collapsed versus
+split THP. This patch addresses the accounting issue.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-10-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mprotect.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -148,7 +148,7 @@ static inline unsigned long change_pmd_r
+                               split_huge_page_pmd(vma, addr, pmd);
+                       else if (change_huge_pmd(vma, pmd, addr, newprot,
+                                                prot_numa)) {
+-                              pages += HPAGE_PMD_NR;
++                              pages++;
+                               continue;
+                       }
+                       /* fall through */
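
For a sense of the magnitude involved, here is a minimal standalone C
sketch of the accounting delta this patch fixes, assuming the common
x86-64 configuration of 4 KiB base pages and 2 MiB PMD-mapped huge pages
(so HPAGE_PMD_NR is 512); the constants and workload figure below are
illustrative stand-ins, not kernel code.

#include <stdio.h>

#define PAGE_SHIFT   12                /* assumed: 4 KiB base pages */
#define PMD_SHIFT    21                /* assumed: 2 MiB PMD mappings */
#define HPAGE_PMD_NR (1UL << (PMD_SHIFT - PAGE_SHIFT))  /* = 512 */

int main(void)
{
    unsigned long thp_updates = 1000;  /* hypothetical THP hinting updates */

    /* Before the patch: each THP PMD update counted as 512 PTE updates. */
    printf("old vmstat accounting: %lu\n", thp_updates * HPAGE_PMD_NR);
    /* After the patch: one PMD update is accounted as one update. */
    printf("new vmstat accounting: %lu\n", thp_updates);
    return 0;
}
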
diff --git a/queue-3.11/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch b/queue-3.11/mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch
new file mode 100644
index 0000000..234a6cc
--- /dev/null
@@ -0,0 +1,173 @@
+From 3f926ab945b60a5824369d21add7710622a2eac0 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:46 +0100
+Subject: mm: Close races between THP migration and PMD numa clearing
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 3f926ab945b60a5824369d21add7710622a2eac0 upstream.
+
+THP migration uses the page lock to guard against parallel allocations,
+but cases like the following are still open:
+
+  Task A                     Task B
+  ---------------------      ---------------------
+  do_huge_pmd_numa_page                                do_huge_pmd_numa_page
+  lock_page
+  mpol_misplaced == -1
+  unlock_page
+  goto clear_pmdnuma
+                                               lock_page
+                                               mpol_misplaced == 2
+                                               migrate_misplaced_transhuge
+  pmd = pmd_mknonnuma
+  set_pmd_at
+
+During hours of testing, one machine crashed with weird errors; while I
+have no direct evidence, I suspect something like the race above happened.
+This patch extends the page lock to be held until the pmd_numa bit is
+cleared, preventing migration from starting in parallel while the bit is
+being cleared. It also flushes the old pmd entry and orders pagetable
+insertion before rmap insertion.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |   33 +++++++++++++++------------------
+ mm/migrate.c     |   19 +++++++++++--------
+ 2 files changed, 26 insertions(+), 26 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1316,24 +1316,25 @@ int do_huge_pmd_numa_page(struct mm_stru
+       target_nid = mpol_misplaced(page, vma, haddr);
+       if (target_nid == -1) {
+               /* If the page was locked, there are no parallel migrations */
+-              if (page_locked) {
+-                      unlock_page(page);
++              if (page_locked)
+                       goto clear_pmdnuma;
+-              }
+-              /* Otherwise wait for potential migrations and retry fault */
++              /*
++               * Otherwise wait for potential migrations and retry. We do
++               * relock and check_same as the page may no longer be mapped.
++               * As the fault is being retried, do not account for it.
++               */
+               spin_unlock(&mm->page_table_lock);
+               wait_on_page_locked(page);
++              page_nid = -1;
+               goto out;
+       }
+       /* Page is misplaced, serialise migrations and parallel THP splits */
+       get_page(page);
+       spin_unlock(&mm->page_table_lock);
+-      if (!page_locked) {
++      if (!page_locked)
+               lock_page(page);
+-              page_locked = true;
+-      }
+       anon_vma = page_lock_anon_vma_read(page);
+       /* Confirm the PTE did not change while locked */
+@@ -1341,32 +1342,28 @@ int do_huge_pmd_numa_page(struct mm_stru
+       if (unlikely(!pmd_same(pmd, *pmdp))) {
+               unlock_page(page);
+               put_page(page);
++              page_nid = -1;
+               goto out_unlock;
+       }
+-      /* Migrate the THP to the requested node */
++      /*
++       * Migrate the THP to the requested node, returns with page unlocked
++       * and pmd_numa cleared.
++       */
+       spin_unlock(&mm->page_table_lock);
+       migrated = migrate_misplaced_transhuge_page(mm, vma,
+                               pmdp, pmd, addr, page, target_nid);
+       if (migrated)
+               page_nid = target_nid;
+-      else
+-              goto check_same;
+       goto out;
+-
+-check_same:
+-      spin_lock(&mm->page_table_lock);
+-      if (unlikely(!pmd_same(pmd, *pmdp))) {
+-              /* Someone else took our fault */
+-              page_nid = -1;
+-              goto out_unlock;
+-      }
+ clear_pmdnuma:
++      BUG_ON(!PageLocked(page));
+       pmd = pmd_mknonnuma(pmd);
+       set_pmd_at(mm, haddr, pmdp, pmd);
+       VM_BUG_ON(pmd_numa(*pmdp));
+       update_mmu_cache_pmd(vma, addr, pmdp);
++      unlock_page(page);
+ out_unlock:
+       spin_unlock(&mm->page_table_lock);
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1712,12 +1712,12 @@ int migrate_misplaced_transhuge_page(str
+               unlock_page(new_page);
+               put_page(new_page);             /* Free it */
+-              unlock_page(page);
++              /* Retake the callers reference and putback on LRU */
++              get_page(page);
+               putback_lru_page(page);
+-
+-              count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+-              isolated = 0;
+-              goto out;
++              mod_zone_page_state(page_zone(page),
++                       NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
++              goto out_fail;
+       }
+       /*
+@@ -1734,9 +1734,9 @@ int migrate_misplaced_transhuge_page(str
+       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       entry = pmd_mkhuge(entry);
+-      page_add_new_anon_rmap(new_page, vma, haddr);
+-
++      pmdp_clear_flush(vma, haddr, pmd);
+       set_pmd_at(mm, haddr, pmd, entry);
++      page_add_new_anon_rmap(new_page, vma, haddr);
+       update_mmu_cache_pmd(vma, address, &entry);
+       page_remove_rmap(page);
+       /*
+@@ -1755,7 +1755,6 @@ int migrate_misplaced_transhuge_page(str
+       count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+       count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+-out:
+       mod_zone_page_state(page_zone(page),
+                       NR_ISOLATED_ANON + page_lru,
+                       -HPAGE_PMD_NR);
+@@ -1764,6 +1763,10 @@ out:
+ out_fail:
+       count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ out_dropref:
++      entry = pmd_mknonnuma(entry);
++      set_pmd_at(mm, haddr, pmd, entry);
++      update_mmu_cache_pmd(vma, address, &entry);
++
+       unlock_page(page);
+       put_page(page);
+       return 0;
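
To make the fixed ordering concrete, here is a compilable userspace
sketch of the new do_huge_pmd_numa_page() flow with stubbed stand-ins
for the kernel primitives (none of the functions below are real kernel
APIs; they only trace the control flow):

#include <stdio.h>
#include <stdbool.h>

static bool trylock_page(void)   { puts("trylock_page");   return true; }
static void unlock_page(void)    { puts("unlock_page"); }
static int  mpol_misplaced(void) { puts("mpol_misplaced"); return -1; }
static void clear_pmd_numa(void) { puts("pmd_mknonnuma + set_pmd_at"); }

int main(void)
{
    bool page_locked = trylock_page();

    if (mpol_misplaced() == -1 && page_locked) {
        /*
         * Before this patch the page was unlocked here, *then* the
         * pmd_numa bit was cleared, leaving a window in which a
         * parallel migration could start. Now the lock is held until
         * the new PMD is in place.
         */
        clear_pmd_numa();
        unlock_page();      /* only after the PMD has been rewritten */
    }
    return 0;
}
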
diff --git a/queue-3.11/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch b/queue-3.11/mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch
new file mode 100644
index 0000000..0384863
--- /dev/null
@@ -0,0 +1,41 @@
+From 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:42 +0100
+Subject: mm: numa: Do not account for a hinting fault if we raced
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 1dd49bfa3465756b3ce72214b58a33e4afb67aa3 upstream.
+
+If another task handled a hinting fault in parallel then do not
+double-account for it.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-5-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1337,8 +1337,11 @@ int do_huge_pmd_numa_page(struct mm_stru
+ check_same:
+       spin_lock(&mm->page_table_lock);
+-      if (unlikely(!pmd_same(pmd, *pmdp)))
++      if (unlikely(!pmd_same(pmd, *pmdp))) {
++              /* Someone else took our fault */
++              current_nid = -1;
+               goto out_unlock;
++      }
+ clear_pmdnuma:
+       pmd = pmd_mknonnuma(pmd);
+       set_pmd_at(mm, haddr, pmdp, pmd);
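
The pattern the hunk annotates, drop the lock, retake it, re-check the
guarded value and bail out without accounting if it changed, can be
modelled in userspace. A hedged sketch with a pthread mutex standing in
for page_table_lock and a plain counter for the PMD (illustrative
stand-ins only; compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long pmd_val = 42;      /* stand-in for *pmdp */

static void fault_path(unsigned long snapshot)
{
    pthread_mutex_lock(&table_lock);
    if (pmd_val != snapshot) {          /* !pmd_same(pmd, *pmdp) */
        /* Someone else took our fault: do not account for it. */
        pthread_mutex_unlock(&table_lock);
        printf("raced, not accounted\n");
        return;
    }
    pmd_val = 0;                        /* clear_pmdnuma equivalent */
    pthread_mutex_unlock(&table_lock);
    printf("handled and accounted\n");
}

int main(void)
{
    unsigned long snapshot;

    pthread_mutex_lock(&table_lock);
    snapshot = pmd_val;                 /* pmd = *pmdp, under the lock */
    pthread_mutex_unlock(&table_lock);  /* lock dropped: races possible */

    fault_path(snapshot);               /* value unchanged: accounted */
    fault_path(snapshot);               /* value changed: raced path */
    return 0;
}
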
diff --git a/queue-3.11/mm-numa-sanitize-task_numa_fault-callsites.patch b/queue-3.11/mm-numa-sanitize-task_numa_fault-callsites.patch
new file mode 100644
index 0000000..9b1520a
--- /dev/null
@@ -0,0 +1,230 @@
+From c61109e34f60f6e85bb43c5a1cd51c0e3db40847 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:45 +0100
+Subject: mm: numa: Sanitize task_numa_fault() callsites
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit c61109e34f60f6e85bb43c5a1cd51c0e3db40847 upstream.
+
+There are three callers of task_numa_fault():
+
+ - do_huge_pmd_numa_page():
+     Accounts against the current node, not the node where the
+     page resides, unless we migrated, in which case it accounts
+     against the node we migrated to.
+
+ - do_numa_page():
+     Accounts against the current node, not the node where the
+     page resides, unless we migrated, in which case it accounts
+     against the node we migrated to.
+
+ - do_pmd_numa_page():
+     Accounts not at all when the page isn't migrated, otherwise
+     accounts against the node we migrated towards.
+
+This seems wrong to me; all three sites should have the same
+semantics. Furthermore, we should account against where the page
+really is; we already know where the task is.
+
+So modify all three sites to always account (we did, after all, receive
+the fault) and to always account to where the page is after migration,
+regardless of success.
+
+They all still differ on when they clear the PTE/PMD; ideally that
+would get sorted too.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-8-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |   25 +++++++++++++------------
+ mm/memory.c      |   53 +++++++++++++++++++++--------------------------------
+ 2 files changed, 34 insertions(+), 44 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1293,18 +1293,19 @@ int do_huge_pmd_numa_page(struct mm_stru
+       struct anon_vma *anon_vma = NULL;
+       struct page *page;
+       unsigned long haddr = addr & HPAGE_PMD_MASK;
++      int page_nid = -1, this_nid = numa_node_id();
+       int target_nid;
+-      int current_nid = -1;
+-      bool migrated, page_locked;
++      bool page_locked;
++      bool migrated = false;
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(pmd, *pmdp)))
+               goto out_unlock;
+       page = pmd_page(pmd);
+-      current_nid = page_to_nid(page);
++      page_nid = page_to_nid(page);
+       count_vm_numa_event(NUMA_HINT_FAULTS);
+-      if (current_nid == numa_node_id())
++      if (page_nid == this_nid)
+               count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+       /*
+@@ -1347,19 +1348,18 @@ int do_huge_pmd_numa_page(struct mm_stru
+       spin_unlock(&mm->page_table_lock);
+       migrated = migrate_misplaced_transhuge_page(mm, vma,
+                               pmdp, pmd, addr, page, target_nid);
+-      if (!migrated)
++      if (migrated)
++              page_nid = target_nid;
++      else
+               goto check_same;
+-      task_numa_fault(target_nid, HPAGE_PMD_NR, true);
+-      if (anon_vma)
+-              page_unlock_anon_vma_read(anon_vma);
+-      return 0;
++      goto out;
+ check_same:
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(pmd, *pmdp))) {
+               /* Someone else took our fault */
+-              current_nid = -1;
++              page_nid = -1;
+               goto out_unlock;
+       }
+ clear_pmdnuma:
+@@ -1374,8 +1374,9 @@ out:
+       if (anon_vma)
+               page_unlock_anon_vma_read(anon_vma);
+-      if (current_nid != -1)
+-              task_numa_fault(current_nid, HPAGE_PMD_NR, false);
++      if (page_nid != -1)
++              task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
++
+       return 0;
+ }
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3532,12 +3532,12 @@ static int do_nonlinear_fault(struct mm_
+ }
+ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+-                              unsigned long addr, int current_nid)
++                              unsigned long addr, int page_nid)
+ {
+       get_page(page);
+       count_vm_numa_event(NUMA_HINT_FAULTS);
+-      if (current_nid == numa_node_id())
++      if (page_nid == numa_node_id())
+               count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+       return mpol_misplaced(page, vma, addr);
+@@ -3548,7 +3548,7 @@ int do_numa_page(struct mm_struct *mm, s
+ {
+       struct page *page = NULL;
+       spinlock_t *ptl;
+-      int current_nid = -1;
++      int page_nid = -1;
+       int target_nid;
+       bool migrated = false;
+@@ -3578,15 +3578,10 @@ int do_numa_page(struct mm_struct *mm, s
+               return 0;
+       }
+-      current_nid = page_to_nid(page);
+-      target_nid = numa_migrate_prep(page, vma, addr, current_nid);
++      page_nid = page_to_nid(page);
++      target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+       pte_unmap_unlock(ptep, ptl);
+       if (target_nid == -1) {
+-              /*
+-               * Account for the fault against the current node if it not
+-               * being replaced regardless of where the page is located.
+-               */
+-              current_nid = numa_node_id();
+               put_page(page);
+               goto out;
+       }
+@@ -3594,11 +3589,11 @@ int do_numa_page(struct mm_struct *mm, s
+       /* Migrate to the requested node */
+       migrated = migrate_misplaced_page(page, target_nid);
+       if (migrated)
+-              current_nid = target_nid;
++              page_nid = target_nid;
+ out:
+-      if (current_nid != -1)
+-              task_numa_fault(current_nid, 1, migrated);
++      if (page_nid != -1)
++              task_numa_fault(page_nid, 1, migrated);
+       return 0;
+ }
+@@ -3613,7 +3608,6 @@ static int do_pmd_numa_page(struct mm_st
+       unsigned long offset;
+       spinlock_t *ptl;
+       bool numa = false;
+-      int local_nid = numa_node_id();
+       spin_lock(&mm->page_table_lock);
+       pmd = *pmdp;
+@@ -3636,9 +3630,10 @@ static int do_pmd_numa_page(struct mm_st
+       for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+               pte_t pteval = *pte;
+               struct page *page;
+-              int curr_nid = local_nid;
++              int page_nid = -1;
+               int target_nid;
+-              bool migrated;
++              bool migrated = false;
++
+               if (!pte_present(pteval))
+                       continue;
+               if (!pte_numa(pteval))
+@@ -3660,25 +3655,19 @@ static int do_pmd_numa_page(struct mm_st
+               if (unlikely(page_mapcount(page) != 1))
+                       continue;
+-              /*
+-               * Note that the NUMA fault is later accounted to either
+-               * the node that is currently running or where the page is
+-               * migrated to.
+-               */
+-              curr_nid = local_nid;
+-              target_nid = numa_migrate_prep(page, vma, addr,
+-                                             page_to_nid(page));
+-              if (target_nid == -1) {
++              page_nid = page_to_nid(page);
++              target_nid = numa_migrate_prep(page, vma, addr, page_nid);
++              pte_unmap_unlock(pte, ptl);
++              if (target_nid != -1) {
++                      migrated = migrate_misplaced_page(page, target_nid);
++                      if (migrated)
++                              page_nid = target_nid;
++              } else {
+                       put_page(page);
+-                      continue;
+               }
+-              /* Migrate to the requested node */
+-              pte_unmap_unlock(pte, ptl);
+-              migrated = migrate_misplaced_page(page, target_nid);
+-              if (migrated)
+-                      curr_nid = target_nid;
+-              task_numa_fault(curr_nid, 1, migrated);
++              if (page_nid != -1)
++                      task_numa_fault(page_nid, 1, migrated);
+               pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+       }
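
The rule this patch installs at all three callsites condenses to one
helper: account against the node where the page ends up, and pass along
whether migration happened. A sketch, with made-up helper names and the
convention that -1 means "no valid node":

#include <stdio.h>
#include <stdbool.h>

/* Stub for the kernel's task_numa_fault(); only traces its arguments. */
static void task_numa_fault(int nid, int pages, bool migrated)
{
    printf("fault: nid=%d pages=%d migrated=%d\n", nid, pages, migrated);
}

static void account_hinting_fault(int page_nid, int target_nid,
                                  bool migrated, int pages)
{
    if (migrated)
        page_nid = target_nid;  /* the page now resides on the target */
    if (page_nid != -1)         /* -1: we raced and the fault retries */
        task_numa_fault(page_nid, pages, migrated);
}

int main(void)
{
    account_hinting_fault(0, 1, true, 512);   /* migrated THP */
    account_hinting_fault(0, -1, false, 1);   /* page stayed where it was */
    account_hinting_fault(-1, -1, false, 1);  /* raced: nothing accounted */
    return 0;
}
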
diff --git a/queue-3.11/mm-prevent-parallel-splits-during-thp-migration.patch b/queue-3.11/mm-prevent-parallel-splits-during-thp-migration.patch
new file mode 100644
index 0000000..95453ce
--- /dev/null
@@ -0,0 +1,122 @@
+From 587fe586f44a48f9691001ba6c45b86c8e4ba21f Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:44 +0100
+Subject: mm: Prevent parallel splits during THP migration
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 587fe586f44a48f9691001ba6c45b86c8e4ba21f upstream.
+
+THP migrations are serialised by the page lock, but on its own that does
+not prevent THP splits. If the page is split during THP migration then
+the pmd_same checks will prevent page table corruption, but the
+unlock_page and other fix-ups can potentially cause corruption. This
+patch takes the anon_vma lock to prevent parallel splits during migration.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-7-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |   44 ++++++++++++++++++++++++++++++--------------
+ 1 file changed, 30 insertions(+), 14 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1290,18 +1290,18 @@ out:
+ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+                               unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+ {
++      struct anon_vma *anon_vma = NULL;
+       struct page *page;
+       unsigned long haddr = addr & HPAGE_PMD_MASK;
+       int target_nid;
+       int current_nid = -1;
+-      bool migrated;
++      bool migrated, page_locked;
+       spin_lock(&mm->page_table_lock);
+       if (unlikely(!pmd_same(pmd, *pmdp)))
+               goto out_unlock;
+       page = pmd_page(pmd);
+-      get_page(page);
+       current_nid = page_to_nid(page);
+       count_vm_numa_event(NUMA_HINT_FAULTS);
+       if (current_nid == numa_node_id())
+@@ -1311,12 +1311,29 @@ int do_huge_pmd_numa_page(struct mm_stru
+        * Acquire the page lock to serialise THP migrations but avoid dropping
+        * page_table_lock if at all possible
+        */
+-      if (trylock_page(page))
+-              goto got_lock;
++      page_locked = trylock_page(page);
++      target_nid = mpol_misplaced(page, vma, haddr);
++      if (target_nid == -1) {
++              /* If the page was locked, there are no parallel migrations */
++              if (page_locked) {
++                      unlock_page(page);
++                      goto clear_pmdnuma;
++              }
++
++              /* Otherwise wait for potential migrations and retry fault */
++              spin_unlock(&mm->page_table_lock);
++              wait_on_page_locked(page);
++              goto out;
++      }
+-      /* Serialise against migration and check placement */
++      /* Page is misplaced, serialise migrations and parallel THP splits */
++      get_page(page);
+       spin_unlock(&mm->page_table_lock);
+-      lock_page(page);
++      if (!page_locked) {
++              lock_page(page);
++              page_locked = true;
++      }
++      anon_vma = page_lock_anon_vma_read(page);
+       /* Confirm the PTE did not while locked */
+       spin_lock(&mm->page_table_lock);
+@@ -1326,14 +1343,6 @@ int do_huge_pmd_numa_page(struct mm_stru
+               goto out_unlock;
+       }
+-got_lock:
+-      target_nid = mpol_misplaced(page, vma, haddr);
+-      if (target_nid == -1) {
+-              unlock_page(page);
+-              put_page(page);
+-              goto clear_pmdnuma;
+-      }
+-
+       /* Migrate the THP to the requested node */
+       spin_unlock(&mm->page_table_lock);
+       migrated = migrate_misplaced_transhuge_page(mm, vma,
+@@ -1342,6 +1351,8 @@ got_lock:
+               goto check_same;
+       task_numa_fault(target_nid, HPAGE_PMD_NR, true);
++      if (anon_vma)
++              page_unlock_anon_vma_read(anon_vma);
+       return 0;
+ check_same:
+@@ -1358,6 +1369,11 @@ clear_pmdnuma:
+       update_mmu_cache_pmd(vma, addr, pmdp);
+ out_unlock:
+       spin_unlock(&mm->page_table_lock);
++
++out:
++      if (anon_vma)
++              page_unlock_anon_vma_read(anon_vma);
++
+       if (current_nid != -1)
+               task_numa_fault(current_nid, HPAGE_PMD_NR, false);
+       return 0;
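
The serialisation added here maps naturally onto a reader-writer lock:
migration takes the anon_vma lock for read via page_lock_anon_vma_read()
while a THP split takes it for write, so neither can run while the other
holds it in the conflicting mode. A userspace model using a pthread
rwlock (the kernel lock is actually an rwsem, and the function bodies
below are stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t anon_vma_lock = PTHREAD_RWLOCK_INITIALIZER;

static void migrate_thp(void)
{
    pthread_rwlock_rdlock(&anon_vma_lock);  /* page_lock_anon_vma_read() */
    printf("migrating: parallel splits are excluded\n");
    pthread_rwlock_unlock(&anon_vma_lock);
}

static void split_thp(void)
{
    pthread_rwlock_wrlock(&anon_vma_lock);  /* anon_vma_lock_write() */
    printf("splitting: no migration in flight\n");
    pthread_rwlock_unlock(&anon_vma_lock);
}

int main(void)
{
    migrate_thp();
    split_thp();
    return 0;
}
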
diff --git a/queue-3.11/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch b/queue-3.11/mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch
new file mode 100644
index 0000000..80de7f4
--- /dev/null
@@ -0,0 +1,81 @@
+From 42836f5f8baa33085f547098b74aa98991ee9216 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 7 Oct 2013 11:28:43 +0100
+Subject: mm: Wait for THP migrations to complete during NUMA hinting faults
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 42836f5f8baa33085f547098b74aa98991ee9216 upstream.
+
+The locking for migrating THP is unusual. While normal page migration
+prevents parallel accesses using a migration PTE, THP migration relies on
+a combination of the page_table_lock, the page lock and the existence of
+the NUMA hinting PTE to guarantee safety, but there is a bug in the scheme.
+
+If a THP page is currently being migrated and another thread traps a
+fault on the same page, it checks whether the page is misplaced. If it
+is not, then pmd_numa is cleared. The problem is that the check is made
+without holding the page lock, meaning that the racing thread can be
+migrating the THP when the second thread clears the NUMA bit and faults
+a stale page.
+
+This patch checks whether the page is potentially being migrated and,
+if so, stalls using lock_page before checking whether the page is
+misplaced.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <peterz@infradead.org>
+Link: http://lkml.kernel.org/r/1381141781-10992-6-git-send-email-mgorman@suse.de
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |   23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1307,13 +1307,14 @@ int do_huge_pmd_numa_page(struct mm_stru
+       if (current_nid == numa_node_id())
+               count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+-      target_nid = mpol_misplaced(page, vma, haddr);
+-      if (target_nid == -1) {
+-              put_page(page);
+-              goto clear_pmdnuma;
+-      }
++      /*
++       * Acquire the page lock to serialise THP migrations but avoid dropping
++       * page_table_lock if at all possible
++       */
++      if (trylock_page(page))
++              goto got_lock;
+-      /* Acquire the page lock to serialise THP migrations */
++      /* Serialise against migration and check placement */
+       spin_unlock(&mm->page_table_lock);
+       lock_page(page);
+@@ -1324,9 +1325,17 @@ int do_huge_pmd_numa_page(struct mm_stru
+               put_page(page);
+               goto out_unlock;
+       }
+-      spin_unlock(&mm->page_table_lock);
++
++got_lock:
++      target_nid = mpol_misplaced(page, vma, haddr);
++      if (target_nid == -1) {
++              unlock_page(page);
++              put_page(page);
++              goto clear_pmdnuma;
++      }
+       /* Migrate the THP to the requested node */
++      spin_unlock(&mm->page_table_lock);
+       migrated = migrate_misplaced_transhuge_page(mm, vma,
+                               pmdp, pmd, addr, page, target_nid);
+       if (!migrated)
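
The stall this patch introduces, trylock first and wait for any in-flight
migration before trusting a placement check, can be modelled with a
pthread mutex standing in for the page lock. The usleep() calls exist
only to make the interesting interleaving likely; this is an
illustration, not kernel code (compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static void *migrator(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&page_lock);       /* migration holds the page lock */
    puts("migration: copying the THP");
    usleep(200000);
    pthread_mutex_unlock(&page_lock);     /* migration complete */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, migrator, NULL);
    usleep(50000);                        /* let the migration start first */

    if (pthread_mutex_trylock(&page_lock) != 0) {
        /*
         * The old code checked placement here, against a page that
         * might be mid-migration. The fix stalls until the lock holder
         * (the migration) is finished.
         */
        puts("fault: page locked, waiting for migration");
        pthread_mutex_lock(&page_lock);   /* returns once migration ends */
    }
    puts("fault: safe to check placement and clear pmd_numa");
    pthread_mutex_unlock(&page_lock);

    pthread_join(t, NULL);
    return 0;
}
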
diff --git a/queue-3.11/series b/queue-3.11/series
index 00fbd9f0493f41f6d054fc43ee8acf5ce48acc3e..56847d048a2c15abe138cf87b2e51548ede6490d 100644
@@ -66,3 +66,9 @@ lib-scatterlist.c-don-t-flush_kernel_dcache_page-on-slab-page.patch
 aacraid-missing-capable-check-in-compat-ioctl.patch
 clk-fixup-argument-order-when-setting-vco-parameters.patch
 clk-nomadik-set-all-timers-to-use-2.4-mhz-timclk.patch
+mm-numa-do-not-account-for-a-hinting-fault-if-we-raced.patch
+mm-wait-for-thp-migrations-to-complete-during-numa-hinting-faults.patch
+mm-prevent-parallel-splits-during-thp-migration.patch
+mm-numa-sanitize-task_numa_fault-callsites.patch
+mm-close-races-between-thp-migration-and-pmd-numa-clearing.patch
+mm-account-for-a-thp-numa-hinting-update-as-one-pte-update.patch