--- /dev/null
+From 27c73ae759774e63313c1fbfeb17ba076cea64c5 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 21 Nov 2013 14:32:02 -0800
+Subject: mm: hugetlbfs: fix hugetlbfs optimization
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 27c73ae759774e63313c1fbfeb17ba076cea64c5 upstream.
+
+Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database
+caused by THP") can cause a dereference of a dangling pointer if
+split_huge_page runs during PageHuge() while there are updates to the
+tail_page->private field.
+
+It is also calling compound_head twice for hugetlbfs pages, and calling
+both compound_head and compound_trans_head for THP, when a single call
+is enough in both cases.
+
+The new code within the PageSlab() check does not need to verify that
+the THP page size is never bigger than the smallest hugetlbfs page size
+in order to avoid memory corruption.
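+
+For reference, the fix stops calling PageHuge() on a tail page whose
+head pointer may be going away under split_huge_page, and instead
+identifies a hugetlbfs head page by its compound destructor; a
+condensed sketch of the PageHeadHuge() helper introduced below
+(kernel-internal names, not a standalone program):
+
+	/* true only for a hugetlbfs head page, never for THP or slab */
+	int PageHeadHuge(struct page *page_head)
+	{
+		if (!PageHead(page_head))
+			return 0;
+		/* hugetlbfs frees its compound pages via free_huge_page() */
+		return get_compound_page_dtor(page_head) == free_huge_page;
+	}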
+
+A longstanding theoretical race condition was found while fixing the
+above (see the change right after the skip_lock label; it is relevant
+for the compound_lock path too).
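+
+The change in question, condensed from the mm/swap.c hunk below: when
+the last reference to the stale page_head is dropped here, the page may
+have been freed and reallocated as a compound page of smaller order, so
+it must be released via the matching path:
+
+	skip_lock:
+		if (put_page_testzero(page_head)) {
+			/*
+			 * page_head cannot have become a THP head or a
+			 * tail page, but it may now be a smaller compound
+			 * page, so pick the right release path.
+			 */
+			if (PageHead(page_head))
+				__put_compound_page(page_head);
+			else
+				__put_single_page(page_head);
+		}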
+
+By re-establishing the _mapcount tail refcounting for all compound
+pages, this also fixes the below problem:
+
+ echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+
+ BUG: Bad page state in process bash pfn:59a01
+ page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0
+ page flags: 0x1c00000000008000(tail)
+ Modules linked in:
+ CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25
+ Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
+ Call Trace:
+ dump_stack+0x55/0x76
+ bad_page+0xd5/0x130
+ free_pages_prepare+0x213/0x280
+ __free_pages+0x36/0x80
+ update_and_free_page+0xc1/0xd0
+ free_pool_huge_page+0xc2/0xe0
+ set_max_huge_pages.part.58+0x14c/0x220
+ nr_hugepages_store_common.isra.60+0xd0/0xf0
+ nr_hugepages_store+0x13/0x20
+ kobj_attr_store+0xf/0x20
+ sysfs_write_file+0x189/0x1e0
+ vfs_write+0xc5/0x1f0
+ SyS_write+0x55/0xb0
+ system_call_fastpath+0x16/0x1b
+
+Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Tested-by: Khalid Aziz <khalid.aziz@oracle.com>
+Cc: Pravin Shelar <pshelar@nicira.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Ben Hutchings <bhutchings@solarflare.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Johannes Weiner <jweiner@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Minchan Kim <minchan@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Guillaume Morin <guillaume@morinfr.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/hugetlb.h | 6 ++
+ mm/hugetlb.c | 17 +++++
+ mm/swap.c | 143 +++++++++++++++++++++++++++---------------------
+ 3 files changed, 106 insertions(+), 60 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -31,6 +31,7 @@ struct hugepage_subpool *hugepage_new_su
+ void hugepage_put_subpool(struct hugepage_subpool *spool);
+
+ int PageHuge(struct page *page);
++int PageHeadHuge(struct page *page_head);
+
+ void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+ int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+@@ -103,6 +104,11 @@ static inline int PageHuge(struct page *
+ {
+ return 0;
+ }
++
++static inline int PageHeadHuge(struct page *page_head)
++{
++ return 0;
++}
+
+ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+ {
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -736,6 +736,23 @@ int PageHuge(struct page *page)
+ }
+ EXPORT_SYMBOL_GPL(PageHuge);
+
++/*
++ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
++ * normal or transparent huge pages.
++ */
++int PageHeadHuge(struct page *page_head)
++{
++ compound_page_dtor *dtor;
++
++ if (!PageHead(page_head))
++ return 0;
++
++ dtor = get_compound_page_dtor(page_head);
++
++ return dtor == free_huge_page;
++}
++EXPORT_SYMBOL_GPL(PageHeadHuge);
++
+ pgoff_t __basepage_index(struct page *page)
+ {
+ struct page *page_head = compound_head(page);
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -82,19 +82,6 @@ static void __put_compound_page(struct p
+
+ static void put_compound_page(struct page *page)
+ {
+- /*
+- * hugetlbfs pages cannot be split from under us. If this is a
+- * hugetlbfs page, check refcount on head page and release the page if
+- * the refcount becomes zero.
+- */
+- if (PageHuge(page)) {
+- page = compound_head(page);
+- if (put_page_testzero(page))
+- __put_compound_page(page);
+-
+- return;
+- }
+-
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = compound_trans_head(page);
+@@ -111,14 +98,31 @@ static void put_compound_page(struct pag
+ * still hot on arches that do not support
+ * this_cpu_cmpxchg_double().
+ */
+- if (PageSlab(page_head)) {
+- if (PageTail(page)) {
++ if (PageSlab(page_head) || PageHeadHuge(page_head)) {
++ if (likely(PageTail(page))) {
++ /*
++ * __split_huge_page_refcount
++ * cannot race here.
++ */
++ VM_BUG_ON(!PageHead(page_head));
++ atomic_dec(&page->_mapcount);
+ if (put_page_testzero(page_head))
+ VM_BUG_ON(1);
+-
+- atomic_dec(&page->_mapcount);
+- goto skip_lock_tail;
++ if (put_page_testzero(page_head))
++ __put_compound_page(page_head);
++ return;
+ } else
++ /*
++ * __split_huge_page_refcount
++ * run before us, "page" was a
++ * THP tail. The split
++ * page_head has been freed
++ * and reallocated as slab or
++ * hugetlbfs page of smaller
++ * order (only possible if
++ * reallocated as slab on
++ * x86).
++ */
+ goto skip_lock;
+ }
+ /*
+@@ -132,8 +136,27 @@ static void put_compound_page(struct pag
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+ skip_lock:
+- if (put_page_testzero(page_head))
+- __put_single_page(page_head);
++ if (put_page_testzero(page_head)) {
++ /*
++ * The head page may have been
++ * freed and reallocated as a
++ * compound page of smaller
++ * order and then freed again.
++ * All we know is that it
++ * cannot have become: a THP
++ * page, a compound page of
++ * higher order, a tail page.
++ * That is because we still
++ * hold the refcount of the
++ * split THP tail and
++ * page_head was the THP head
++ * before the split.
++ */
++ if (PageHead(page_head))
++ __put_compound_page(page_head);
++ else
++ __put_single_page(page_head);
++ }
+ out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+@@ -155,7 +178,6 @@ out_put_single:
+ VM_BUG_ON(atomic_read(&page->_count) != 0);
+ compound_unlock_irqrestore(page_head, flags);
+
+-skip_lock_tail:
+ if (put_page_testzero(page_head)) {
+ if (PageHead(page_head))
+ __put_compound_page(page_head);
+@@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page)
+ * proper PT lock that already serializes against
+ * split_huge_page().
+ */
++ unsigned long flags;
+ bool got = false;
+- struct page *page_head;
+-
+- /*
+- * If this is a hugetlbfs page it cannot be split under us. Simply
+- * increment refcount for the head page.
+- */
+- if (PageHuge(page)) {
+- page_head = compound_head(page);
+- atomic_inc(&page_head->_count);
+- got = true;
+- } else {
+- unsigned long flags;
+-
+- page_head = compound_trans_head(page);
+- if (likely(page != page_head &&
+- get_page_unless_zero(page_head))) {
+-
+- /* Ref to put_compound_page() comment. */
+- if (PageSlab(page_head)) {
+- if (likely(PageTail(page))) {
+- __get_page_tail_foll(page, false);
+- return true;
+- } else {
+- put_page(page_head);
+- return false;
+- }
+- }
++ struct page *page_head = compound_trans_head(page);
+
+- /*
+- * page_head wasn't a dangling pointer but it
+- * may not be a head page anymore by the time
+- * we obtain the lock. That is ok as long as it
+- * can't be freed from under us.
+- */
+- flags = compound_lock_irqsave(page_head);
+- /* here __split_huge_page_refcount won't run anymore */
++ if (likely(page != page_head && get_page_unless_zero(page_head))) {
++ /* Ref to put_compound_page() comment. */
++ if (PageSlab(page_head) || PageHeadHuge(page_head)) {
+ if (likely(PageTail(page))) {
++ /*
++ * This is a hugetlbfs page or a slab
++ * page. __split_huge_page_refcount
++ * cannot race here.
++ */
++ VM_BUG_ON(!PageHead(page_head));
+ __get_page_tail_foll(page, false);
+- got = true;
+- }
+- compound_unlock_irqrestore(page_head, flags);
+- if (unlikely(!got))
++ return true;
++ } else {
++ /*
++ * __split_huge_page_refcount run
++ * before us, "page" was a THP
++ * tail. The split page_head has been
++ * freed and reallocated as slab or
++ * hugetlbfs page of smaller order
++ * (only possible if reallocated as
++ * slab on x86).
++ */
+ put_page(page_head);
++ return false;
++ }
++ }
++
++ /*
++ * page_head wasn't a dangling pointer but it
++ * may not be a head page anymore by the time
++ * we obtain the lock. That is ok as long as it
++ * can't be freed from under us.
++ */
++ flags = compound_lock_irqsave(page_head);
++ /* here __split_huge_page_refcount won't run anymore */
++ if (likely(PageTail(page))) {
++ __get_page_tail_foll(page, false);
++ got = true;
+ }
++ compound_unlock_irqrestore(page_head, flags);
++ if (unlikely(!got))
++ put_page(page_head);
+ }
+ return got;
+ }
--- /dev/null
+From b3084f4db3aeb991c507ca774337c7e7893ed04f Mon Sep 17 00:00:00 2001
+From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+Date: Mon, 13 Jan 2014 11:34:24 +0530
+Subject: powerpc/thp: Fix crash on mremap
+
+From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
+
+commit b3084f4db3aeb991c507ca774337c7e7893ed04f upstream.
+
+This patch fixes the crash below:
+
+NIP [c00000000004cee4] .__hash_page_thp+0x2a4/0x440
+LR [c0000000000439ac] .hash_page+0x18c/0x5e0
+...
+Call Trace:
+[c000000736103c40] [00001ffffb000000] 0x1ffffb000000(unreliable)
+[437908.479693] [c000000736103d50] [c0000000000439ac] .hash_page+0x18c/0x5e0
+[437908.479699] [c000000736103e30] [c00000000000924c] .do_hash_page+0x4c/0x58
+
+On ppc64 we use the pgtable for storing the hpte slot information and
+store the address of the pgtable at a constant offset (PTRS_PER_PMD)
+from the pmd. On mremap, when we switch the pmd, we need to withdraw
+and re-deposit the pgtable so that it is found at the PTRS_PER_PMD
+offset from the new pmd.
+
+We also want to do the withdraw and deposit before the set_pmd so
+that, when a page fault finds the pmd marked trans huge, we can be
+sure the pgtable is located at that offset.
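+
+A condensed sketch of the ordering the mm/huge_memory.c hunk below
+establishes in move_huge_pmd() (kernel-internal names, guarded by the
+new ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW option):
+
+	pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+	/* move the deposited pgtable along with the pmd ... */
+	pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+	pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+	/* ... before the new pmd is made visible as trans huge */
+	set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));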
+
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ arch/Kconfig | 3 +++
+ arch/powerpc/platforms/Kconfig.cputype | 1 +
+ mm/huge_memory.c | 12 ++++++++++++
+ 3 files changed, 16 insertions(+)
+
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -365,6 +365,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ config HAVE_ARCH_SOFT_DIRTY
+ bool
+
++config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
++ bool
++
+ config HAVE_MOD_ARCH_SPECIFIC
+ bool
+ help
+--- a/arch/powerpc/platforms/Kconfig.cputype
++++ b/arch/powerpc/platforms/Kconfig.cputype
+@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
+ select PPC_FPU
+ select PPC_HAVE_PMU_SUPPORT
+ select SYS_SUPPORTS_HUGETLBFS
++ select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+
+ config PPC_BOOK3E_64
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1474,8 +1474,20 @@ int move_huge_pmd(struct vm_area_struct
+
+ ret = __pmd_trans_huge_lock(old_pmd, vma);
+ if (ret == 1) {
++#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
++ pgtable_t pgtable;
++#endif
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
++#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
++ /*
++ * Archs like ppc64 use pgtable to store per pmd
++ * specific information. So when we switch the pmd,
++ * we should also withdraw and deposit the pgtable
++ */
++ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
++ pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
++#endif
+ set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+ spin_unlock(&mm->page_table_lock);
+ }