--- /dev/null
+From 27c73ae759774e63313c1fbfeb17ba076cea64c5 Mon Sep 17 00:00:00 2001
+From: Andrea Arcangeli <aarcange@redhat.com>
+Date: Thu, 21 Nov 2013 14:32:02 -0800
+Subject: mm: hugetlbfs: fix hugetlbfs optimization
+
+From: Andrea Arcangeli <aarcange@redhat.com>
+
+commit 27c73ae759774e63313c1fbfeb17ba076cea64c5 upstream.
+
+Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database
+caused by THP") can cause dereference of a dangling pointer if
+split_huge_page runs during PageHuge() if there are updates to the
+tail_page->private field.
+
+Also it is repeating compound_head twice for hugetlbfs and it is running
+compound_head+compound_trans_head for THP when a single one is needed in
+both cases.
+
+The new code within the PageSlab() check doesn't need to verify that the
+THP page size is never bigger than the smallest hugetlbfs page size, to
+avoid memory corruption.
+
+A longstanding theoretical race condition was found while fixing the
+above (see the change right after the skip_unlock label, that is
+relevant for the compound_lock path too).
+
+By re-establishing the _mapcount tail refcounting for all compound
+pages, this also fixes the below problem:
+
+ echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
+
+ BUG: Bad page state in process bash pfn:59a01
+ page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0
+ page flags: 0x1c00000000008000(tail)
+ Modules linked in:
+ CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25
+ Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
+ Call Trace:
+ dump_stack+0x55/0x76
+ bad_page+0xd5/0x130
+ free_pages_prepare+0x213/0x280
+ __free_pages+0x36/0x80
+ update_and_free_page+0xc1/0xd0
+ free_pool_huge_page+0xc2/0xe0
+ set_max_huge_pages.part.58+0x14c/0x220
+ nr_hugepages_store_common.isra.60+0xd0/0xf0
+ nr_hugepages_store+0x13/0x20
+ kobj_attr_store+0xf/0x20
+ sysfs_write_file+0x189/0x1e0
+ vfs_write+0xc5/0x1f0
+ SyS_write+0x55/0xb0
+ system_call_fastpath+0x16/0x1b
+
+Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
+Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
+Tested-by: Khalid Aziz <khalid.aziz@oracle.com>
+Cc: Pravin Shelar <pshelar@nicira.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Ben Hutchings <bhutchings@solarflare.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Johannes Weiner <jweiner@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Minchan Kim <minchan@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Guillaume Morin <guillaume@morinfr.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ include/linux/hugetlb.h | 6 ++
+ mm/hugetlb.c | 17 +++++++
+ mm/swap.c | 107 ++++++++++++++++++++++++++++++++++--------------
+ 3 files changed, 101 insertions(+), 29 deletions(-)
+
+--- a/include/linux/hugetlb.h
++++ b/include/linux/hugetlb.h
+@@ -24,6 +24,7 @@ struct hugepage_subpool *hugepage_new_su
+ void hugepage_put_subpool(struct hugepage_subpool *spool);
+
+ int PageHuge(struct page *page);
++int PageHeadHuge(struct page *page_head);
+
+ void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
+ int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
+@@ -84,6 +85,11 @@ static inline int PageHuge(struct page *
+ {
+ return 0;
+ }
++
++static inline int PageHeadHuge(struct page *page_head)
++{
++ return 0;
++}
+
+ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+ {
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -679,6 +679,23 @@ int PageHuge(struct page *page)
+ }
+ EXPORT_SYMBOL_GPL(PageHuge);
+
++/*
++ * PageHeadHuge() only returns true for hugetlbfs head page, but not for
++ * normal or transparent huge pages.
++ */
++int PageHeadHuge(struct page *page_head)
++{
++ compound_page_dtor *dtor;
++
++ if (!PageHead(page_head))
++ return 0;
++
++ dtor = get_compound_page_dtor(page_head);
++
++ return dtor == free_huge_page;
++}
++EXPORT_SYMBOL_GPL(PageHeadHuge);
++
+ pgoff_t __basepage_index(struct page *page)
+ {
+ struct page *page_head = compound_head(page);
+--- a/mm/swap.c
++++ b/mm/swap.c
+@@ -77,18 +77,6 @@ static void __put_compound_page(struct p
+
+ static void put_compound_page(struct page *page)
+ {
+- /*
+- * hugetlbfs pages cannot be split from under us. If this is a
+- * hugetlbfs page, check refcount on head page and release the page if
+- * the refcount becomes zero.
+- */
+- if (PageHuge(page)) {
+- page = compound_head(page);
+- if (put_page_testzero(page))
+- __put_compound_page(page);
+- return;
+- }
+-
+ if (unlikely(PageTail(page))) {
+ /* __split_huge_page_refcount can run under us */
+ struct page *page_head = compound_trans_head(page);
+@@ -96,6 +84,35 @@ static void put_compound_page(struct pag
+ if (likely(page != page_head &&
+ get_page_unless_zero(page_head))) {
+ unsigned long flags;
++
++ if (PageHeadHuge(page_head)) {
++ if (likely(PageTail(page))) {
++ /*
++ * __split_huge_page_refcount
++ * cannot race here.
++ */
++ VM_BUG_ON(!PageHead(page_head));
++ atomic_dec(&page->_mapcount);
++ if (put_page_testzero(page_head))
++ VM_BUG_ON(1);
++ if (put_page_testzero(page_head))
++ __put_compound_page(page_head);
++ return;
++ } else {
++ /*
++ * __split_huge_page_refcount
++ * run before us, "page" was a
++ * THP tail. The split
++ * page_head has been freed
++ * and reallocated as slab or
++ * hugetlbfs page of smaller
++ * order (only possible if
++ * reallocated as slab on
++ * x86).
++ */
++ goto skip_lock;
++ }
++ }
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+@@ -107,9 +124,29 @@ static void put_compound_page(struct pag
+ /* __split_huge_page_refcount run before us */
+ compound_unlock_irqrestore(page_head, flags);
+ VM_BUG_ON(PageHead(page_head));
+- if (put_page_testzero(page_head))
+- __put_single_page(page_head);
+- out_put_single:
++skip_lock:
++ if (put_page_testzero(page_head)) {
++ /*
++ * The head page may have been
++ * freed and reallocated as a
++ * compound page of smaller
++ * order and then freed again.
++ * All we know is that it
++ * cannot have become: a THP
++ * page, a compound page of
++ * higher order, a tail page.
++ * That is because we still
++ * hold the refcount of the
++ * split THP tail and
++ * page_head was the THP head
++ * before the split.
++ */
++ if (PageHead(page_head))
++ __put_compound_page(page_head);
++ else
++ __put_single_page(page_head);
++ }
++out_put_single:
+ if (put_page_testzero(page))
+ __put_single_page(page);
+ return;
+@@ -173,21 +210,34 @@ bool __get_page_tail(struct page *page)
+ */
+ unsigned long flags;
+ bool got = false;
+- struct page *page_head;
++ struct page *page_head = compound_trans_head(page);
+
+- /*
+- * If this is a hugetlbfs page it cannot be split under us. Simply
+- * increment refcount for the head page.
+- */
+- if (PageHuge(page)) {
+- page_head = compound_head(page);
+- atomic_inc(&page_head->_count);
+- got = true;
+- goto out;
+- }
+-
+- page_head = compound_trans_head(page);
+ if (likely(page != page_head && get_page_unless_zero(page_head))) {
++ /* Ref to put_compound_page() comment. */
++ if (PageHeadHuge(page_head)) {
++ if (likely(PageTail(page))) {
++ /*
++ * This is a hugetlbfs
++ * page. __split_huge_page_refcount
++ * cannot race here.
++ */
++ VM_BUG_ON(!PageHead(page_head));
++ __get_page_tail_foll(page, false);
++ return true;
++ } else {
++ /*
++ * __split_huge_page_refcount run
++ * before us, "page" was a THP
++ * tail. The split page_head has been
++ * freed and reallocated as slab or
++ * hugetlbfs page of smaller order
++ * (only possible if reallocated as
++ * slab on x86).
++ */
++ put_page(page_head);
++ return false;
++ }
++ }
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+@@ -204,7 +254,6 @@ bool __get_page_tail(struct page *page)
+ if (unlikely(!got))
+ put_page(page_head);
+ }
+-out:
+ return got;
+ }
+ EXPORT_SYMBOL(__get_page_tail);