From f78fb89ea532a96f17585a80132f6f5c9ed12afa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 7 Jan 2014 09:47:30 -0800 Subject: [PATCH] 3.12-stable patches added patches: mm-numa-call-mmu-notifiers-on-thp-migration.patch mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch --- ...m-clear-pmd_numa-before-invalidating.patch | 46 ----- ...-call-mmu-notifiers-on-thp-migration.patch | 103 ++++++++++ ...-get_user_page-against-thp-migration.patch | 193 ++++++++++++++++++ queue-3.12/series | 3 +- 4 files changed, 298 insertions(+), 47 deletions(-) delete mode 100644 queue-3.12/mm-clear-pmd_numa-before-invalidating.patch create mode 100644 queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch create mode 100644 queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch diff --git a/queue-3.12/mm-clear-pmd_numa-before-invalidating.patch b/queue-3.12/mm-clear-pmd_numa-before-invalidating.patch deleted file mode 100644 index 0b6bd84ec1d..00000000000 --- a/queue-3.12/mm-clear-pmd_numa-before-invalidating.patch +++ /dev/null @@ -1,46 +0,0 @@ -From 67f87463d3a3362424efcbe8b40e4772fd34fc61 Mon Sep 17 00:00:00 2001 -From: Mel Gorman -Date: Wed, 18 Dec 2013 17:08:34 -0800 -Subject: mm: clear pmd_numa before invalidating - -From: Mel Gorman - -commit 67f87463d3a3362424efcbe8b40e4772fd34fc61 upstream. - -On x86, PMD entries are similar to _PAGE_PROTNONE protection and are -handled as NUMA hinting faults. The following two page table protection -bits are what defines them - - _PAGE_NUMA:set _PAGE_PRESENT:clear - -A PMD is considered present if any of the _PAGE_PRESENT, _PAGE_PROTNONE, -_PAGE_PSE or _PAGE_NUMA bits are set. If pmdp_invalidate encounters a -pmd_numa, it clears the present bit leaving _PAGE_NUMA which will be -considered not present by the CPU but present by pmd_present. The -existing caller of pmdp_invalidate should handle it but it's an -inconsistent state for a PMD. This patch keeps the state consistent -when calling pmdp_invalidate. - -Signed-off-by: Mel Gorman -Reviewed-by: Rik van Riel -Cc: Alex Thorlton -Signed-off-by: Andrew Morton -Signed-off-by: Linus Torvalds -Signed-off-by: Greg Kroah-Hartman - ---- - mm/pgtable-generic.c | 3 +++ - 1 file changed, 3 insertions(+) - ---- a/mm/pgtable-generic.c -+++ b/mm/pgtable-generic.c -@@ -191,6 +191,9 @@ pgtable_t pgtable_trans_huge_withdraw(st - void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) - { -+ pmd_t entry = *pmdp; -+ if (pmd_numa(entry)) -+ entry = pmd_mknonnuma(entry); - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } diff --git a/queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch b/queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch new file mode 100644 index 00000000000..708630c804b --- /dev/null +++ b/queue-3.12/mm-numa-call-mmu-notifiers-on-thp-migration.patch @@ -0,0 +1,103 @@ +From mgorman@suse.de Tue Jan 7 09:45:19 2014 +From: Mel Gorman +Date: Tue, 7 Jan 2014 14:00:37 +0000 +Subject: mm: numa: call MMU notifiers on THP migration +To: gregkh@linuxfoundation.org +Cc: athorlton@sgi.com, riel@redhat.com, chegu_vinod@hp.com, Mel Gorman , stable@vger.kernel.org +Message-ID: <1389103248-17617-3-git-send-email-mgorman@suse.de> + +From: Mel Gorman + +commit f714f4f20e59ea6eea264a86b9a51fd51b88fc54 upstream. + +MMU notifiers must be called on THP page migration or secondary MMUs +will get very confused. 
+ +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Alex Thorlton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/migrate.c | 22 ++++++++++++++-------- + 1 file changed, 14 insertions(+), 8 deletions(-) + +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + #include + +@@ -1655,12 +1656,13 @@ int migrate_misplaced_transhuge_page(str + unsigned long address, + struct page *page, int node) + { +- unsigned long haddr = address & HPAGE_PMD_MASK; + pg_data_t *pgdat = NODE_DATA(node); + int isolated = 0; + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); ++ unsigned long mmun_start = address & HPAGE_PMD_MASK; ++ unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; + pmd_t orig_entry; + + /* +@@ -1702,10 +1704,12 @@ int migrate_misplaced_transhuge_page(str + WARN_ON(PageLRU(new_page)); + + /* Recheck the target PMD */ ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { + fail_putback: + spin_unlock(&mm->page_table_lock); ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + /* Reverse changes made by migrate_page_copy() */ + if (TestClearPageActive(new_page)) +@@ -1746,15 +1750,16 @@ fail_putback: + * The SetPageUptodate on the new page and page_add_new_anon_rmap + * guarantee the copy is visible before the pagetable update. + */ +- flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); +- page_add_new_anon_rmap(new_page, vma, haddr); +- pmdp_clear_flush(vma, haddr, pmd); +- set_pmd_at(mm, haddr, pmd, entry); ++ flush_cache_range(vma, mmun_start, mmun_end); ++ page_add_new_anon_rmap(new_page, vma, mmun_start); ++ pmdp_clear_flush(vma, mmun_start, pmd); ++ set_pmd_at(mm, mmun_start, pmd, entry); ++ flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + + if (page_count(page) != 2) { +- set_pmd_at(mm, haddr, pmd, orig_entry); +- flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); ++ set_pmd_at(mm, mmun_start, pmd, orig_entry); ++ flush_tlb_range(vma, mmun_start, mmun_end); + update_mmu_cache_pmd(vma, address, &entry); + page_remove_rmap(new_page); + goto fail_putback; +@@ -1769,6 +1774,7 @@ fail_putback: + */ + mem_cgroup_end_migration(memcg, page, new_page, true); + spin_unlock(&mm->page_table_lock); ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + + unlock_page(new_page); + unlock_page(page); +@@ -1789,7 +1795,7 @@ out_dropref: + spin_lock(&mm->page_table_lock); + if (pmd_same(*pmd, entry)) { + entry = pmd_mknonnuma(entry); +- set_pmd_at(mm, haddr, pmd, entry); ++ set_pmd_at(mm, mmun_start, pmd, entry); + update_mmu_cache_pmd(vma, address, &entry); + } + spin_unlock(&mm->page_table_lock); diff --git a/queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch b/queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch new file mode 100644 index 00000000000..a6723c7de86 --- /dev/null +++ b/queue-3.12/mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch @@ -0,0 +1,193 @@ +From mgorman@suse.de Tue Jan 7 09:44:16 2014 +From: Mel Gorman +Date: Tue, 7 Jan 2014 14:00:36 +0000 +Subject: mm: numa: serialise parallel get_user_page against THP migration +To: gregkh@linuxfoundation.org +Cc: athorlton@sgi.com, riel@redhat.com, chegu_vinod@hp.com, Mel Gorman , stable@vger.kernel.org 
+Message-ID: <1389103248-17617-2-git-send-email-mgorman@suse.de> + +From: Mel Gorman + +commit 2b4847e73004c10ae6666c2e27b5c5430aed8698 upstream. + +Base pages are unmapped and flushed from cache and TLB during normal +page migration and replaced with a migration entry that causes any +parallel NUMA hinting fault or gup to block until migration completes. + +THP does not unmap pages due to a lack of support for migration entries +at a PMD level. This allows races with get_user_pages and +get_user_pages_fast which commit 3f926ab945b6 ("mm: Close races between +THP migration and PMD numa clearing") made worse by introducing a +pmd_clear_flush(). + +This patch forces get_user_page (fast and normal) on a pmd_numa page to +go through the slow get_user_page path where it will serialise against +THP migration and properly account for the NUMA hinting fault. On the +migration side the page table lock is taken for each PTE update. + +Signed-off-by: Mel Gorman +Reviewed-by: Rik van Riel +Cc: Alex Thorlton +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/gup.c | 13 +++++++++++++ + mm/huge_memory.c | 24 ++++++++++++++++-------- + mm/migrate.c | 38 +++++++++++++++++++++++++++++++------- + 3 files changed, 60 insertions(+), 15 deletions(-) + +--- a/arch/x86/mm/gup.c ++++ b/arch/x86/mm/gup.c +@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t + pte_t pte = gup_get_pte(ptep); + struct page *page; + ++ /* Similar to the PMD case, NUMA hinting must take slow path */ ++ if (pte_numa(pte)) { ++ pte_unmap(ptep); ++ return 0; ++ } ++ + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; +@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsi + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { ++ /* ++ * NUMA hinting faults need to be handled in the GUP ++ * slowpath for accounting purposes and so that they ++ * can be serialised against THP migration. ++ */ ++ if (pmd_numa(pmd)) ++ return 0; + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1240,6 +1240,10 @@ struct page *follow_trans_huge_pmd(struc + if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) + return ERR_PTR(-EFAULT); + ++ /* Full NUMA hinting faults to serialise migration in fault paths */ ++ if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) ++ goto out; ++ + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { +@@ -1306,23 +1310,27 @@ int do_huge_pmd_numa_page(struct mm_stru + /* If the page was locked, there are no parallel migrations */ + if (page_locked) + goto clear_pmdnuma; ++ } + +- /* +- * Otherwise wait for potential migrations and retry. We do +- * relock and check_same as the page may no longer be mapped. +- * As the fault is being retried, do not account for it. +- */ ++ /* ++ * If there are potential migrations, wait for completion and retry. We ++ * do not relock and check_same as the page may no longer be mapped. ++ * Furtermore, even if the page is currently misplaced, there is no ++ * guarantee it is still misplaced after the migration completes. ++ */ ++ if (!page_locked) { + spin_unlock(&mm->page_table_lock); + wait_on_page_locked(page); + page_nid = -1; + goto out; + } + +- /* Page is misplaced, serialise migrations and parallel THP splits */ ++ /* ++ * Page is misplaced. Page lock serialises migrations. 
Acquire anon_vma ++ * to serialises splits ++ */ + get_page(page); + spin_unlock(&mm->page_table_lock); +- if (!page_locked) +- lock_page(page); + anon_vma = page_lock_anon_vma_read(page); + + /* Confirm the PTE did not while locked */ +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1661,6 +1661,7 @@ int migrate_misplaced_transhuge_page(str + struct page *new_page = NULL; + struct mem_cgroup *memcg = NULL; + int page_lru = page_is_file_cache(page); ++ pmd_t orig_entry; + + /* + * Don't migrate pages that are mapped in multiple processes. +@@ -1702,7 +1703,8 @@ int migrate_misplaced_transhuge_page(str + + /* Recheck the target PMD */ + spin_lock(&mm->page_table_lock); +- if (unlikely(!pmd_same(*pmd, entry))) { ++ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { ++fail_putback: + spin_unlock(&mm->page_table_lock); + + /* Reverse changes made by migrate_page_copy() */ +@@ -1732,16 +1734,34 @@ int migrate_misplaced_transhuge_page(str + */ + mem_cgroup_prepare_migration(page, new_page, &memcg); + ++ orig_entry = *pmd; + entry = mk_pmd(new_page, vma->vm_page_prot); +- entry = pmd_mknonnuma(entry); +- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); ++ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + ++ /* ++ * Clear the old entry under pagetable lock and establish the new PTE. ++ * Any parallel GUP will either observe the old page blocking on the ++ * page lock, block on the page table lock or observe the new page. ++ * The SetPageUptodate on the new page and page_add_new_anon_rmap ++ * guarantee the copy is visible before the pagetable update. ++ */ ++ flush_cache_range(vma, haddr, haddr + HPAGE_PMD_SIZE); ++ page_add_new_anon_rmap(new_page, vma, haddr); + pmdp_clear_flush(vma, haddr, pmd); + set_pmd_at(mm, haddr, pmd, entry); +- page_add_new_anon_rmap(new_page, vma, haddr); + update_mmu_cache_pmd(vma, address, &entry); ++ ++ if (page_count(page) != 2) { ++ set_pmd_at(mm, haddr, pmd, orig_entry); ++ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE); ++ update_mmu_cache_pmd(vma, address, &entry); ++ page_remove_rmap(new_page); ++ goto fail_putback; ++ } ++ + page_remove_rmap(page); ++ + /* + * Finish the charge transaction under the page table lock to + * prevent split_huge_page() from dividing up the charge +@@ -1766,9 +1786,13 @@ int migrate_misplaced_transhuge_page(str + out_fail: + count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); + out_dropref: +- entry = pmd_mknonnuma(entry); +- set_pmd_at(mm, haddr, pmd, entry); +- update_mmu_cache_pmd(vma, address, &entry); ++ spin_lock(&mm->page_table_lock); ++ if (pmd_same(*pmd, entry)) { ++ entry = pmd_mknonnuma(entry); ++ set_pmd_at(mm, haddr, pmd, entry); ++ update_mmu_cache_pmd(vma, address, &entry); ++ } ++ spin_unlock(&mm->page_table_lock); + + unlock_page(page); + put_page(page); diff --git a/queue-3.12/series b/queue-3.12/series index 0b9f3b1922b..d7408c6dcf3 100644 --- a/queue-3.12/series +++ b/queue-3.12/series @@ -107,7 +107,8 @@ ext2-fix-oops-in-ext2_get_block-called-from-ext2_quota_write.patch acpi-pci-hotplug-avoid-warning-when-_adr-not-present.patch intel_pstate-fail-initialization-if-p-state-information-is-missing.patch revert-of-address-handle-address-cells-2-specially.patch -mm-clear-pmd_numa-before-invalidating.patch +mm-numa-serialise-parallel-get_user_page-against-thp-migration.patch +mm-numa-call-mmu-notifiers-on-thp-migration.patch mm-numa-ensure-anon_vma-is-locked-to-prevent-parallel-thp-splits.patch mm-numa-avoid-unnecessary-work-on-the-failure-path.patch 
mm-fix-tlb-flush-race-between-migration-and-change_protection_range.patch -- 2.47.3
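
As an aside to the two queued patches above: the core idea of the serialisation fix — a lock-free fast path (get_user_pages_fast) that notices the pmd_numa marker and bails out to the slow path, which then blocks until the THP migration holding the lock has finished — can be sketched outside the kernel. The userspace C program below is a minimal analogy, not kernel code and not part of either patch; every name in it (current_page, migration_pending, page_lock, read_fast, read_slow, migrator) is invented for the sketch. It models only the control flow "marker seen on the fast path, so take the lock-protected slow path"; it does not model the kernel's page refcounting, TLB-flush IPIs, or memory-ordering guarantees.

/* Illustrative userspace analogy only -- build: cc -std=c11 -O2 -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ  4096
#define NREADERS 4

static char page_a[PAGE_SZ], page_b[PAGE_SZ];

static _Atomic(char *) current_page = page_a;     /* stands in for the PMD */
static atomic_bool migration_pending = false;     /* stands in for pmd_numa() */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER; /* "page lock" */

/* Slow path: block on the migrator's lock, then read a stable page. */
static char read_slow(size_t off)
{
	pthread_mutex_lock(&page_lock);
	char c = atomic_load(&current_page)[off];
	pthread_mutex_unlock(&page_lock);
	return c;
}

/* Fast path: lock-free, but punt to the slow path while migration is pending,
 * the way the patched gup_pmd_range() returns 0 on pmd_numa(). */
static char read_fast(size_t off)
{
	if (atomic_load(&migration_pending))
		return read_slow(off);
	return atomic_load(&current_page)[off];
}

static void *migrator(void *unused)
{
	for (int i = 0; i < 1000; i++) {
		pthread_mutex_lock(&page_lock);
		atomic_store(&migration_pending, true);

		char *src = atomic_load(&current_page);
		char *dst = (src == page_a) ? page_b : page_a;

		memcpy(dst, src, PAGE_SZ);          /* copy to the new "page"   */
		atomic_store(&current_page, dst);   /* publish the new mapping  */

		atomic_store(&migration_pending, false);
		pthread_mutex_unlock(&page_lock);
	}
	return unused;
}

static void *reader(void *unused)
{
	for (size_t i = 0; i < 1000000; i++)
		(void)read_fast(i % PAGE_SZ);
	return unused;
}

int main(void)
{
	pthread_t mig, rd[NREADERS];

	memset(page_a, 'x', PAGE_SZ);
	pthread_create(&mig, NULL, migrator, NULL);
	for (int i = 0; i < NREADERS; i++)
		pthread_create(&rd[i], NULL, reader, NULL);

	pthread_join(mig, NULL);
	for (int i = 0; i < NREADERS; i++)
		pthread_join(rd[i], NULL);
	puts("ok");
	return 0;
}

In the sketch the copy always lands in the buffer that is not currently published and the pointer swap happens under the lock, which roughly mirrors the kernel ordering the patches enforce: the data is copied into new_page first, the PMD is only rewritten under the page table lock, and the pmd_numa check pushes any concurrent get_user_pages onto the path that waits for that migration to complete.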