--- /dev/null
+From 04697858d89e4bf2650364f8d6956e2554e8ef88 Mon Sep 17 00:00:00 2001
+From: Yinghai Lu <yinghai@kernel.org>
+Date: Fri, 4 Sep 2015 15:42:39 -0700
+Subject: mm: check if section present during memory block registering
+
+From: Yinghai Lu <yinghai@kernel.org>
+
+commit 04697858d89e4bf2650364f8d6956e2554e8ef88 upstream.
+
+Tony Luck found that, on his setup, a memory block size of 512M would cause
+a crash during booting.
+
+ BUG: unable to handle kernel paging request at ffffea0074000020
+ IP: get_nid_for_pfn+0x17/0x40
+ PGD 128ffcb067 PUD 128ffc9067 PMD 0
+ Oops: 0000 [#1] SMP
+ Modules linked in:
+ CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.2.0-rc8 #1
+ ...
+ Call Trace:
+ ? register_mem_sect_under_node+0x66/0xe0
+ register_one_node+0x17b/0x240
+ ? pci_iommu_alloc+0x6e/0x6e
+ topology_init+0x3c/0x95
+ do_one_initcall+0xcd/0x1f0
+
+The system has non-contiguous RAM addresses:
+ BIOS-e820: [mem 0x0000001300000000-0x0000001cffffffff] usable
+ BIOS-e820: [mem 0x0000001d70000000-0x0000001ec7ffefff] usable
+ BIOS-e820: [mem 0x0000001f00000000-0x0000002bffffffff] usable
+ BIOS-e820: [mem 0x0000002c18000000-0x0000002d6fffefff] usable
+ BIOS-e820: [mem 0x0000002e00000000-0x00000039ffffffff] usable
+
+So a memory block can have leading sections that are not present. For example:
+
+ memory block : [0x2c18000000, 0x2c20000000) 512M
+
+Its first three sections are not present.
+
+The current register_mem_sect_under_node() assumes the first section is
+present, but a memory block's section number range [start_section_nr,
+end_section_nr] may include sections that are not present.
+
+For arches that support vmemmap, we don't set up the memmap (the struct
+page area) for sections that are not present.
+
+So skip the pfn ranges that belong to absent sections.
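+
+A minimal userspace sketch of the skip arithmetic (not kernel code; it
+assumes 4K pages and 128M sections, i.e. PAGES_PER_SECTION = 0x8000 as on
+x86_64):
+
+ #include <stdio.h>
+
+ #define PAGES_PER_SECTION 0x8000UL
+ /* power-of-two round down, same idea as the kernel's round_down() */
+ #define round_down(x, y)  ((x) & ~((y) - 1))
+
+ int main(void)
+ {
+         unsigned long pfn = 0x2c00000UL + 123;  /* inside an absent section */
+
+         /* jump to the last pfn of the current section ... */
+         pfn = round_down(pfn + PAGES_PER_SECTION, PAGES_PER_SECTION) - 1;
+
+         /* ... so the enclosing loop's pfn++ tries the next section start */
+         printf("next pfn tried: %#lx\n", pfn + 1);  /* prints 0x2c08000 */
+         return 0;
+ }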
+
+[akpm@linux-foundation.org: simplification]
+[rientjes@google.com: more simplification]
+Fixes: bdee237c0343 ("x86: mm: Use 2GB memory block size on large memory x86-64 systems")
+Fixes: 982792c782ef ("x86, mm: probe memory block size for generic x86 64bit")
+Signed-off-by: Yinghai Lu <yinghai@kernel.org>
+Signed-off-by: David Rientjes <rientjes@google.com>
+Reported-by: Tony Luck <tony.luck@intel.com>
+Tested-by: Tony Luck <tony.luck@intel.com>
+Cc: Greg KH <greg@kroah.com>
+Cc: Ingo Molnar <mingo@elte.hu>
+Tested-by: David Rientjes <rientjes@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/node.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/drivers/base/node.c
++++ b/drivers/base/node.c
+@@ -388,6 +388,16 @@ int register_mem_sect_under_node(struct
+ for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
+ int page_nid;
+
++ /*
++ * memory block could have several absent sections from start.
++ * skip pfn range from absent section
++ */
++ if (!pfn_present(pfn)) {
++ pfn = round_down(pfn + PAGES_PER_SECTION,
++ PAGES_PER_SECTION) - 1;
++ continue;
++ }
++
+ page_nid = get_nid_for_pfn(pfn);
+ if (page_nid < 0)
+ continue;
--- /dev/null
+From 2f064f3485cd29633ad1b3cfb00cc519509a3d72 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 21 Aug 2015 14:11:51 -0700
+Subject: mm: make page pfmemalloc check more robust
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 2f064f3485cd29633ad1b3cfb00cc519509a3d72 upstream.
+
+Commit c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added
+checks for page->pfmemalloc to __skb_fill_page_desc():
+
+ if (page->pfmemalloc && !page->mapping)
+ skb->pfmemalloc = true;
+
+It assumes that page->mapping == NULL implies that page->pfmemalloc can be
+trusted. However, __delete_from_page_cache() can set page->mapping to NULL
+while leaving the page->index value alone. Because the two fields share a
+union, a non-zero page->index will then be interpreted as page->pfmemalloc
+being true.
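+
+A minimal userspace sketch of the aliasing (the struct below is a simplified
+stand-in for struct page; whether the stale byte reads back as true is
+ABI-dependent, but it does on common little-endian setups):
+
+ #include <stdbool.h>
+ #include <stdio.h>
+
+ struct fake_page {
+         void *mapping;
+         union {
+                 unsigned long index;   /* offset within mapping */
+                 bool pfmemalloc;       /* shares the same bytes */
+         };
+ };
+
+ int main(void)
+ {
+         struct fake_page page = { .mapping = &page, .index = 42 };
+
+         page.mapping = NULL;   /* __delete_from_page_cache() leaves index alone */
+         if (page.pfmemalloc && !page.mapping)
+                 printf("wrongly treated as a pfmemalloc page\n");
+         return 0;
+ }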
+
+So the assumption is invalid if the networking code can see such a page,
+and it seems it can. We have encountered this with an NFS-over-loopback
+setup when such a page is attached to a new skbuff. There is no copying
+going on in this case, so the page confuses __skb_fill_page_desc(), which
+interprets the index as the pfmemalloc flag. The network stack then drops
+packets that were allocated using the reserves unless they are to be queued
+on sockets handling the swapping, which is not the case here, and that
+leads to hangs: the NFS client waits for a response from the server which
+has been dropped and thus never arrives.
+
+The struct page is already heavily packed, so rather than finding another
+hole to put the flag in, let's do a trick instead. We can reuse the index
+field but set it to an impossible value (-1UL): this is a page index, so it
+should never legitimately be that large. Replace all direct users of
+page->pfmemalloc with page_is_pfmemalloc(), which hides this nastiness from
+unspoiled eyes.
+
+The information will obviously get lost if somebody then wants to use
+page->index, but that was already the case before, and the original code
+expected the information to be persisted somewhere else if it is really
+needed (e.g. what SLAB and SLUB do).
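+
+A tiny userspace mock of the resulting semantics (helper names mirror the
+patch below; fake_page is a stand-in, not the real struct page):
+
+ #include <assert.h>
+ #include <stdbool.h>
+
+ struct fake_page { unsigned long index; };
+
+ static void set_page_pfmemalloc(struct fake_page *page)   { page->index = -1UL; }
+ static void clear_page_pfmemalloc(struct fake_page *page) { page->index = 0; }
+ static bool page_is_pfmemalloc(struct fake_page *page)    { return page->index == -1UL; }
+
+ int main(void)
+ {
+         struct fake_page page;
+
+         set_page_pfmemalloc(&page);            /* allocator used ALLOC_NO_WATERMARKS */
+         assert(page_is_pfmemalloc(&page));
+
+         clear_page_pfmemalloc(&page);          /* normal allocation */
+         page.index = 42;                       /* later reuse as a page-cache index ... */
+         assert(!page_is_pfmemalloc(&page));    /* ... no longer misreads as pfmemalloc */
+         return 0;
+ }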
+
+[akpm@linux-foundation.org: fix blooper in slub]
+Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb")
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Debugged-by: Vlastimil Babka <vbabka@suse.com>
+Debugged-by: Jiri Bohac <jbohac@suse.com>
+Cc: Eric Dumazet <eric.dumazet@gmail.com>
+Cc: David Miller <davem@davemloft.net>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 -
+ drivers/net/ethernet/intel/igb/igb_main.c | 2 -
+ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 -
+ drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 -
+ include/linux/mm.h | 28 ++++++++++++++++++++++
+ include/linux/mm_types.h | 9 -------
+ include/linux/skbuff.h | 14 +++--------
+ mm/page_alloc.c | 7 +++--
+ mm/slab.c | 4 +--
+ mm/slub.c | 2 -
+ net/core/skbuff.c | 2 -
+ 11 files changed, 46 insertions(+), 28 deletions(-)
+
+--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
++++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c
+@@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct f
+
+ static inline bool fm10k_page_is_reserved(struct page *page)
+ {
+- return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
++ return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
+ }
+
+ static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
+--- a/drivers/net/ethernet/intel/igb/igb_main.c
++++ b/drivers/net/ethernet/intel/igb/igb_main.c
+@@ -6596,7 +6596,7 @@ static void igb_reuse_rx_page(struct igb
+
+ static inline bool igb_page_is_reserved(struct page *page)
+ {
+- return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
++ return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
+ }
+
+ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+@@ -1829,7 +1829,7 @@ static void ixgbe_reuse_rx_page(struct i
+
+ static inline bool ixgbe_page_is_reserved(struct page *page)
+ {
+- return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
++ return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
+ }
+
+ /**
+--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
++++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+@@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct
+
+ static inline bool ixgbevf_page_is_reserved(struct page *page)
+ {
+- return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
++ return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
+ }
+
+ /**
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -1002,6 +1002,34 @@ static inline int page_mapped(struct pag
+ }
+
+ /*
++ * Return true only if the page has been allocated with
++ * ALLOC_NO_WATERMARKS and the low watermark was not
++ * met implying that the system is under some pressure.
++ */
++static inline bool page_is_pfmemalloc(struct page *page)
++{
++ /*
++ * Page index cannot be this large so this must be
++ * a pfmemalloc page.
++ */
++ return page->index == -1UL;
++}
++
++/*
++ * Only to be called by the page allocator on a freshly allocated
++ * page.
++ */
++static inline void set_page_pfmemalloc(struct page *page)
++{
++ page->index = -1UL;
++}
++
++static inline void clear_page_pfmemalloc(struct page *page)
++{
++ page->index = 0;
++}
++
++/*
+ * Different kinds of faults, as returned by handle_mm_fault().
+ * Used to decide whether a process gets delivered SIGBUS or
+ * just gets major/minor fault counters bumped up.
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -63,15 +63,6 @@ struct page {
+ union {
+ pgoff_t index; /* Our offset within mapping. */
+ void *freelist; /* sl[aou]b first free object */
+- bool pfmemalloc; /* If set by the page allocator,
+- * ALLOC_NO_WATERMARKS was set
+- * and the low watermark was not
+- * met implying that the system
+- * is under some pressure. The
+- * caller should try ensure
+- * this page is only used to
+- * free other pages.
+- */
+ };
+
+ union {
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -1590,20 +1590,16 @@ static inline void __skb_fill_page_desc(
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ /*
+- * Propagate page->pfmemalloc to the skb if we can. The problem is
+- * that not all callers have unique ownership of the page. If
+- * pfmemalloc is set, we check the mapping as a mapping implies
+- * page->index is set (index and pfmemalloc share space).
+- * If it's a valid mapping, we cannot use page->pfmemalloc but we
+- * do not lose pfmemalloc information as the pages would not be
+- * allocated using __GFP_MEMALLOC.
++ * Propagate page pfmemalloc to the skb if we can. The problem is
++ * that not all callers have unique ownership of the page but rely
++ * on page_is_pfmemalloc doing the right thing(tm).
+ */
+ frag->page.p = page;
+ frag->page_offset = off;
+ skb_frag_size_set(frag, size);
+
+ page = compound_head(page);
+- if (page->pfmemalloc && !page->mapping)
++ if (page_is_pfmemalloc(page))
+ skb->pfmemalloc = true;
+ }
+
+@@ -2250,7 +2246,7 @@ static inline struct page *dev_alloc_pag
+ static inline void skb_propagate_pfmemalloc(struct page *page,
+ struct sk_buff *skb)
+ {
+- if (page && page->pfmemalloc)
++ if (page_is_pfmemalloc(page))
+ skb->pfmemalloc = true;
+ }
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -983,12 +983,15 @@ static int prep_new_page(struct page *pa
+ set_page_owner(page, order, gfp_flags);
+
+ /*
+- * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
++ * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
+ * allocate the page. The expectation is that the caller is taking
+ * steps that will free more memory. The caller should avoid the page
+ * being used for !PFMEMALLOC purposes.
+ */
+- page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
++ if (alloc_flags & ALLOC_NO_WATERMARKS)
++ set_page_pfmemalloc(page);
++ else
++ clear_page_pfmemalloc(page);
+
+ return 0;
+ }
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -1602,7 +1602,7 @@ static struct page *kmem_getpages(struct
+ }
+
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+- if (unlikely(page->pfmemalloc))
++ if (page_is_pfmemalloc(page))
+ pfmemalloc_active = true;
+
+ nr_pages = (1 << cachep->gfporder);
+@@ -1613,7 +1613,7 @@ static struct page *kmem_getpages(struct
+ add_zone_page_state(page_zone(page),
+ NR_SLAB_UNRECLAIMABLE, nr_pages);
+ __SetPageSlab(page);
+- if (page->pfmemalloc)
++ if (page_is_pfmemalloc(page))
+ SetPageSlabPfmemalloc(page);
+
+ if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem
+ inc_slabs_node(s, page_to_nid(page), page->objects);
+ page->slab_cache = s;
+ __SetPageSlab(page);
+- if (page->pfmemalloc)
++ if (page_is_pfmemalloc(page))
+ SetPageSlabPfmemalloc(page);
+
+ start = page_address(page);
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, un
+
+ if (skb && frag_size) {
+ skb->head_frag = 1;
+- if (virt_to_head_page(data)->pfmemalloc)
++ if (page_is_pfmemalloc(virt_to_head_page(data)))
+ skb->pfmemalloc = 1;
+ }
+ return skb;