git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.11-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Jun 2017 13:16:16 +0000 (15:16 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Jun 2017 13:16:16 +0000 (15:16 +0200)
added patches:
dax-fix-race-between-colliding-pmd-pte-entries.patch
mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch
mm-avoid-spurious-bad-pmd-warning-messages.patch
mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch
mm-hugetlb-report-ehwpoison-not-efault-when-foll_hwpoison-is-specified.patch
mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch
mm-page_alloc.c-make-sure-oom-victim-can-try-allocations-with-no-watermarks-once.patch
pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch
rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch
rdma-srp-fix-null-deref-at-srp_destroy_qp.patch

queue-4.11/dax-fix-race-between-colliding-pmd-pte-entries.patch [new file with mode: 0644]
queue-4.11/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch [new file with mode: 0644]
queue-4.11/mm-avoid-spurious-bad-pmd-warning-messages.patch [new file with mode: 0644]
queue-4.11/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch [new file with mode: 0644]
queue-4.11/mm-hugetlb-report-ehwpoison-not-efault-when-foll_hwpoison-is-specified.patch [new file with mode: 0644]
queue-4.11/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch [new file with mode: 0644]
queue-4.11/mm-page_alloc.c-make-sure-oom-victim-can-try-allocations-with-no-watermarks-once.patch [new file with mode: 0644]
queue-4.11/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch [new file with mode: 0644]
queue-4.11/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch [new file with mode: 0644]
queue-4.11/rdma-srp-fix-null-deref-at-srp_destroy_qp.patch [new file with mode: 0644]
queue-4.11/series

diff --git a/queue-4.11/dax-fix-race-between-colliding-pmd-pte-entries.patch b/queue-4.11/dax-fix-race-between-colliding-pmd-pte-entries.patch
new file mode 100644 (file)
index 0000000..65434e9
--- /dev/null
@@ -0,0 +1,154 @@
+From e2093926a098a8ccf0f1d10f6df8dad452cb28d3 Mon Sep 17 00:00:00 2001
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+Date: Fri, 2 Jun 2017 14:46:37 -0700
+Subject: dax: fix race between colliding PMD & PTE entries
+
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+
+commit e2093926a098a8ccf0f1d10f6df8dad452cb28d3 upstream.
+
+We currently have two related PMD vs PTE races in the DAX code.  These
+can both be easily triggered by having two threads reading and writing
+simultaneously to the same private mapping, with the key being that
+private mapping reads can be handled with PMDs but private mapping
+writes are always handled with PTEs so that we can COW.
+
+Here is the first race:
+
+  CPU 0                                        CPU 1
+
+  (private mapping write)
+  __handle_mm_fault()
+    create_huge_pmd() - FALLBACK
+    handle_pte_fault()
+      passes check for pmd_devmap()
+
+                                       (private mapping read)
+                                       __handle_mm_fault()
+                                         create_huge_pmd()
+                                           dax_iomap_pmd_fault() inserts PMD
+
+      dax_iomap_pte_fault() does a PTE fault, but we already have a DAX PMD
+                         installed in our page tables at this spot.
+
+Here's the second race:
+
+  CPU 0                                        CPU 1
+
+  (private mapping read)
+  __handle_mm_fault()
+    passes check for pmd_none()
+    create_huge_pmd()
+      dax_iomap_pmd_fault() inserts PMD
+
+  (private mapping write)
+  __handle_mm_fault()
+    create_huge_pmd() - FALLBACK
+                                       (private mapping read)
+                                       __handle_mm_fault()
+                                         passes check for pmd_none()
+                                         create_huge_pmd()
+
+    handle_pte_fault()
+      dax_iomap_pte_fault() inserts PTE
+                                           dax_iomap_pmd_fault() inserts PMD,
+                                              but we already have a PTE at
+                                              this spot.
+
+The core of the issue is that while there is isolation between faults to
+the same range in the DAX fault handlers via our DAX entry locking,
+there is no isolation between faults in the code in mm/memory.c.  This
+means for instance that this code in __handle_mm_fault() can run:
+
+       if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+               ret = create_huge_pmd(&vmf);
+
+But by the time we actually get to run the fault handler called by
+create_huge_pmd(), the PMD is no longer pmd_none() because a racing PTE
+fault has installed a normal PMD here as a parent.  This is the cause of
+the 2nd race.  The first race is similar - there is the following check
+in handle_pte_fault():
+
+       } else {
+               /* See comment in pte_alloc_one_map() */
+               if (pmd_devmap(*vmf->pmd) || pmd_trans_unstable(vmf->pmd))
+                       return 0;
+
+So if a pmd_devmap() PMD (a DAX PMD) has been installed at vmf->pmd, we
+will bail and retry the fault.  This is correct, but there is nothing
+preventing the PMD from being installed after this check but before we
+actually get to the DAX PTE fault handlers.
+
+In my testing these races result in the following types of errors:
+
+  BUG: Bad rss-counter state mm:ffff8800a817d280 idx:1 val:1
+  BUG: non-zero nr_ptes on freeing mm: 15
+
+Fix this issue by having the DAX fault handlers verify that it is safe
+to continue their fault after they have taken an entry lock to block
+other racing faults.
+
+[ross.zwisler@linux.intel.com: improve fix for colliding PMD & PTE entries]
+  Link: http://lkml.kernel.org/r/20170526195932.32178-1-ross.zwisler@linux.intel.com
+Link: http://lkml.kernel.org/r/20170522215749.23516-2-ross.zwisler@linux.intel.com
+Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Reported-by: Pawel Lebioda <pawel.lebioda@intel.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Matthew Wilcox <mawilcox@microsoft.com>
+Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Pawel Lebioda <pawel.lebioda@intel.com>
+Cc: Dave Jiang <dave.jiang@intel.com>
+Cc: Xiong Zhou <xzhou@redhat.com>
+Cc: Eryu Guan <eguan@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/dax.c |   23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
+
+--- a/fs/dax.c
++++ b/fs/dax.c
+@@ -1129,6 +1129,17 @@ static int dax_iomap_pte_fault(struct vm
+               return dax_fault_return(PTR_ERR(entry));
+       /*
++       * It is possible, particularly with mixed reads & writes to private
++       * mappings, that we have raced with a PMD fault that overlaps with
++       * the PTE we need to set up.  If so just return and the fault will be
++       * retried.
++       */
++      if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
++              vmf_ret = VM_FAULT_NOPAGE;
++              goto unlock_entry;
++      }
++
++      /*
+        * Note that we don't bother to use iomap_apply here: DAX required
+        * the file system block size to be equal the page size, which means
+        * that we never have to deal with more than a single extent here.
+@@ -1363,6 +1374,18 @@ static int dax_iomap_pmd_fault(struct vm
+               goto fallback;
+       /*
++       * It is possible, particularly with mixed reads & writes to private
++       * mappings, that we have raced with a PTE fault that overlaps with
++       * the PMD we need to set up.  If so just return and the fault will be
++       * retried.
++       */
++      if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
++                      !pmd_devmap(*vmf->pmd)) {
++              result = 0;
++              goto unlock_entry;
++      }
++
++      /*
+        * Note that we don't use iomap_apply here.  We aren't doing I/O, only
+        * setting up a mapping, so really we're using iomap_begin() as a way
+        * to look up our filesystem block.
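
The races above are easiest to hit with a mixed read/write load on a private DAX mapping. For illustration, here is a standalone C load generator, not the reproducer from the report: the file path is a placeholder, the file is assumed to be at least 2MB and suitably laid out on a DAX filesystem, and the comments describe the intended fault behaviour rather than guaranteed outcomes.

  /* Two threads hammer the same MAP_PRIVATE mapping: reads may be served
   * with DAX PMDs, writes always COW through PTEs, which is the collision
   * the patch guards against.  Build with: gcc -O2 -pthread dax_race.c
   */
  #include <fcntl.h>
  #include <pthread.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #define LEN (2UL << 20)                  /* 2MB, so read faults can use a PMD */

  static char *map;

  static void *reader(void *arg)
  {
      volatile char sink;
      (void)arg;
      for (;;)
          for (size_t i = 0; i < LEN; i += 4096)
              sink = map[i];               /* read fault: may install a DAX PMD */
      return NULL;
  }

  static void *writer(void *arg)
  {
      (void)arg;
      for (;;)
          for (size_t i = 0; i < LEN; i += 4096)
              map[i] = 1;                  /* private write: COW, always a PTE */
      return NULL;
  }

  int main(void)
  {
      pthread_t r, w;
      int fd = open("/mnt/dax/testfile", O_RDWR);   /* placeholder path */

      if (fd < 0) { perror("open"); return 1; }
      map = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
      if (map == MAP_FAILED) { perror("mmap"); return 1; }

      pthread_create(&r, NULL, reader, NULL);
      pthread_create(&w, NULL, writer, NULL);
      pthread_join(r, NULL);               /* runs until the process is killed */
      return 0;
  }
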
diff --git a/queue-4.11/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch b/queue-4.11/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch
new file mode 100644 (file)
index 0000000..2aed78c
--- /dev/null
@@ -0,0 +1,113 @@
+From 70feee0e1ef331b22cc51f383d532a0d043fbdcc Mon Sep 17 00:00:00 2001
+From: Yisheng Xie <xieyisheng1@huawei.com>
+Date: Fri, 2 Jun 2017 14:46:43 -0700
+Subject: mlock: fix mlock count can not decrease in race condition
+
+From: Yisheng Xie <xieyisheng1@huawei.com>
+
+commit 70feee0e1ef331b22cc51f383d532a0d043fbdcc upstream.
+
+Kefeng reported that when running the following test, the mlock count in
+meminfo increases permanently:
+
+ [1] testcase
+ linux:~ # cat test_mlockal
+ grep Mlocked /proc/meminfo
+  for j in `seq 0 10`
+  do
+       for i in `seq 4 15`
+       do
+               ./p_mlockall >> log &
+       done
+       sleep 0.2
+ done
+ # wait some time to let mlock counter decrease and 5s may not enough
+ sleep 5
+ grep Mlocked /proc/meminfo
+
+ linux:~ # cat p_mlockall.c
+ #include <sys/mman.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+
+ #define SPACE_LEN     4096
+
+ int main(int argc, char ** argv)
+ {
+               int ret;
+               void *adr = malloc(SPACE_LEN);
+               if (!adr)
+                       return -1;
+
+               ret = mlockall(MCL_CURRENT | MCL_FUTURE);
+               printf("mlcokall ret = %d\n", ret);
+
+               ret = munlockall();
+               printf("munlcokall ret = %d\n", ret);
+
+               free(adr);
+               return 0;
+        }
+
+In __munlock_pagevec() we should decrement NR_MLOCK for each page where
+we clear the PageMlocked flag.  Commit 1ebb7cc6a583 ("mm: munlock: batch
+NR_MLOCK zone state updates") has introduced a bug where we don't
+decrement NR_MLOCK for pages where we clear the flag, but fail to
+isolate them from the lru list (e.g.  when the pages are on some other
+cpu's percpu pagevec).  Since PageMlocked stays cleared, the NR_MLOCK
+accounting gets permanently disrupted by this.
+
+Fix it by counting the number of pages whose PageMlocked flag is cleared.
+
+Fixes: 1ebb7cc6a583 ("mm: munlock: batch NR_MLOCK zone state updates")
+Link: http://lkml.kernel.org/r/1495678405-54569-1-git-send-email-xieyisheng1@huawei.com
+Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
+Reported-by: Kefeng Wang <wangkefeng.wang@huawei.com>
+Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michel Lespinasse <walken@google.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Xishi Qiu <qiuxishi@huawei.com>
+Cc: zhongjiang <zhongjiang@huawei.com>
+Cc: Hanjun Guo <guohanjun@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mlock.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/mlock.c
++++ b/mm/mlock.c
+@@ -286,7 +286,7 @@ static void __munlock_pagevec(struct pag
+ {
+       int i;
+       int nr = pagevec_count(pvec);
+-      int delta_munlocked;
++      int delta_munlocked = -nr;
+       struct pagevec pvec_putback;
+       int pgrescued = 0;
+@@ -306,6 +306,8 @@ static void __munlock_pagevec(struct pag
+                               continue;
+                       else
+                               __munlock_isolation_failed(page);
++              } else {
++                      delta_munlocked++;
+               }
+               /*
+@@ -317,7 +319,6 @@ static void __munlock_pagevec(struct pag
+               pagevec_add(&pvec_putback, pvec->pages[i]);
+               pvec->pages[i] = NULL;
+       }
+-      delta_munlocked = -nr + pagevec_count(&pvec_putback);
+       __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+       spin_unlock_irq(zone_lru_lock(zone));
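
For illustration, a tiny standalone C model of the accounting change (made-up booleans stand in for TestClearPageMlocked() and LRU isolation; no kernel code involved): the old formula charged -nr and corrected by the putback count, which misses pages whose Mlocked flag was cleared but which could not be isolated, while the new code counts exactly the pages whose flag was actually cleared.

  #include <stdbool.h>
  #include <stdio.h>

  int main(void)
  {
      /* mlocked[i]: TestClearPageMlocked() succeeds for page i;
       * isolates[i]: the page can be isolated from the LRU (fails when it
       * sits on another CPU's per-cpu pagevec, as in the report). */
      bool mlocked[4]  = { true, true,  true, false };
      bool isolates[4] = { true, false, true, true  };
      int nr = 4, putback = 0, old_delta, new_delta = -nr;

      for (int i = 0; i < nr; i++) {
          if (mlocked[i]) {              /* the Mlocked flag is cleared here */
              if (isolates[i])
                  continue;              /* munlocked via the isolated list */
          } else {
              new_delta++;               /* flag was already clear */
          }
          putback++;                     /* page goes back onto the LRU */
      }
      old_delta = -nr + putback;         /* pre-fix formula */

      /* Three pages had the flag cleared, so NR_MLOCK should drop by 3:
       * the old formula gives -2 (page 1 is missed), the new one -3. */
      printf("old delta %d, new delta %d\n", old_delta, new_delta);
      return 0;
  }
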
diff --git a/queue-4.11/mm-avoid-spurious-bad-pmd-warning-messages.patch b/queue-4.11/mm-avoid-spurious-bad-pmd-warning-messages.patch
new file mode 100644 (file)
index 0000000..d1932b9
--- /dev/null
@@ -0,0 +1,125 @@
+From d0f0931de936a0a468d7e59284d39581c16d3a73 Mon Sep 17 00:00:00 2001
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+Date: Fri, 2 Jun 2017 14:46:34 -0700
+Subject: mm: avoid spurious 'bad pmd' warning messages
+
+From: Ross Zwisler <ross.zwisler@linux.intel.com>
+
+commit d0f0931de936a0a468d7e59284d39581c16d3a73 upstream.
+
+When the pmd_devmap() checks were added by 5c7fb56e5e3f ("mm, dax:
+dax-pmd vs thp-pmd vs hugetlbfs-pmd") to add better support for DAX huge
+pages, they were all added to the end of if() statements after existing
+pmd_trans_huge() checks.  So, things like:
+
+  -       if (pmd_trans_huge(*pmd))
+  +       if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))
+
+When further checks were added after pmd_trans_unstable() checks by
+commit 7267ec008b5c ("mm: postpone page table allocation until we have
+page to map") they were also added at the end of the conditional:
+
+  +       if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+
+This ordering is fine for pmd_trans_huge(), but doesn't work for
+pmd_trans_unstable().  This is because DAX huge pages trip the bad_pmd()
+check inside of pmd_none_or_trans_huge_or_clear_bad() (called by
+pmd_trans_unstable()), which prints out a warning and returns 1.  So, we
+do end up doing the right thing, but only after spamming dmesg with
+suspicious looking messages:
+
+  mm/pgtable-generic.c:39: bad pmd ffff8808daa49b88(84000001006000a5)
+
+Reorder these checks in a helper so that pmd_devmap() is checked first,
+avoiding the error messages, and add a comment explaining why the
+ordering is important.
+
+Fixes: commit 7267ec008b5c ("mm: postpone page table allocation until we have page to map")
+Link: http://lkml.kernel.org/r/20170522215749.23516-1-ross.zwisler@linux.intel.com
+Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Cc: Pawel Lebioda <pawel.lebioda@intel.com>
+Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: Matthew Wilcox <mawilcox@microsoft.com>
+Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Dave Jiang <dave.jiang@intel.com>
+Cc: Xiong Zhou <xzhou@redhat.com>
+Cc: Eryu Guan <eguan@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory.c |   40 ++++++++++++++++++++++++++++++----------
+ 1 file changed, 30 insertions(+), 10 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3029,6 +3029,17 @@ static int __do_fault(struct vm_fault *v
+       return ret;
+ }
++/*
++ * The ordering of these checks is important for pmds with _PAGE_DEVMAP set.
++ * If we check pmd_trans_unstable() first we will trip the bad_pmd() check
++ * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly
++ * returning 1 but not before it spams dmesg with the pmd_clear_bad() output.
++ */
++static int pmd_devmap_trans_unstable(pmd_t *pmd)
++{
++      return pmd_devmap(*pmd) || pmd_trans_unstable(pmd);
++}
++
+ static int pte_alloc_one_map(struct vm_fault *vmf)
+ {
+       struct vm_area_struct *vma = vmf->vma;
+@@ -3052,18 +3063,27 @@ static int pte_alloc_one_map(struct vm_f
+ map_pte:
+       /*
+        * If a huge pmd materialized under us just retry later.  Use
+-       * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+-       * didn't become pmd_trans_huge under us and then back to pmd_none, as
+-       * a result of MADV_DONTNEED running immediately after a huge pmd fault
+-       * in a different thread of this mm, in turn leading to a misleading
+-       * pmd_trans_huge() retval.  All we have to ensure is that it is a
+-       * regular pmd that we can walk with pte_offset_map() and we can do that
+-       * through an atomic read in C, which is what pmd_trans_unstable()
+-       * provides.
++       * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of
++       * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge
++       * under us and then back to pmd_none, as a result of MADV_DONTNEED
++       * running immediately after a huge pmd fault in a different thread of
++       * this mm, in turn leading to a misleading pmd_trans_huge() retval.
++       * All we have to ensure is that it is a regular pmd that we can walk
++       * with pte_offset_map() and we can do that through an atomic read in
++       * C, which is what pmd_trans_unstable() provides.
+        */
+-      if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
++      if (pmd_devmap_trans_unstable(vmf->pmd))
+               return VM_FAULT_NOPAGE;
++      /*
++       * At this point we know that our vmf->pmd points to a page of ptes
++       * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge()
++       * for the duration of the fault.  If a racing MADV_DONTNEED runs and
++       * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still
++       * be valid and we will re-check to make sure the vmf->pte isn't
++       * pte_none() under vmf->ptl protection when we return to
++       * alloc_set_pte().
++       */
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                       &vmf->ptl);
+       return 0;
+@@ -3690,7 +3710,7 @@ static int handle_pte_fault(struct vm_fa
+               vmf->pte = NULL;
+       } else {
+               /* See comment in pte_alloc_one_map() */
+-              if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
++              if (pmd_devmap_trans_unstable(vmf->pmd))
+                       return 0;
+               /*
+                * A regular pmd is established and it can't morph into a huge
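
The underlying point is just short-circuit ordering: the quiet check must run first when the other check emits a warning for a state it cannot classify. For illustration, a small standalone C sketch with stand-in predicates (not the kernel's pmd helpers) makes the before/after behaviour visible.

  #include <stdbool.h>
  #include <stdio.h>

  enum pmd_state { PMD_REGULAR, PMD_DEVMAP };

  static bool is_devmap(enum pmd_state s)
  {
      return s == PMD_DEVMAP;              /* quiet check */
  }

  static bool is_unstable(enum pmd_state s)
  {
      if (s == PMD_DEVMAP)                 /* looks "bad" to this check... */
          printf("mm/pgtable-generic.c: bad pmd ...\n");   /* ...so it warns */
      return s != PMD_REGULAR;
  }

  /* Mirrors the shape of pmd_devmap_trans_unstable(): the quiet check runs
   * first, so short-circuit evaluation skips the noisy one for DAX PMDs. */
  static bool devmap_or_unstable(enum pmd_state s)
  {
      return is_devmap(s) || is_unstable(s);
  }

  int main(void)
  {
      /* Old ordering: the unstable check runs first and spams the log. */
      printf("old order -> %d\n", is_unstable(PMD_DEVMAP) || is_devmap(PMD_DEVMAP));
      /* New ordering: same decision, no warning. */
      printf("new order -> %d\n", devmap_or_unstable(PMD_DEVMAP));
      return 0;
  }
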
diff --git a/queue-4.11/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch b/queue-4.11/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch
new file mode 100644 (file)
index 0000000..098dcf4
--- /dev/null
@@ -0,0 +1,217 @@
+From 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 2 Jun 2017 14:46:49 -0700
+Subject: mm: consider memblock reservations for deferred memory initialization sizing
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 upstream.
+
+We have seen an early OOM killer invocation on ppc64 systems with
+crashkernel=4096M:
+
+       kthreadd invoked oom-killer: gfp_mask=0x16040c0(GFP_KERNEL|__GFP_COMP|__GFP_NOTRACK), nodemask=7, order=0, oom_score_adj=0
+       kthreadd cpuset=/ mems_allowed=7
+       CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.4.68-1.gd7fe927-default #1
+       Call Trace:
+         dump_stack+0xb0/0xf0 (unreliable)
+         dump_header+0xb0/0x258
+         out_of_memory+0x5f0/0x640
+         __alloc_pages_nodemask+0xa8c/0xc80
+         kmem_getpages+0x84/0x1a0
+         fallback_alloc+0x2a4/0x320
+         kmem_cache_alloc_node+0xc0/0x2e0
+         copy_process.isra.25+0x260/0x1b30
+         _do_fork+0x94/0x470
+         kernel_thread+0x48/0x60
+         kthreadd+0x264/0x330
+         ret_from_kernel_thread+0x5c/0xa4
+
+       Mem-Info:
+       active_anon:0 inactive_anon:0 isolated_anon:0
+        active_file:0 inactive_file:0 isolated_file:0
+        unevictable:0 dirty:0 writeback:0 unstable:0
+        slab_reclaimable:5 slab_unreclaimable:73
+        mapped:0 shmem:0 pagetables:0 bounce:0
+        free:0 free_pcp:0 free_cma:0
+       Node 7 DMA free:0kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:52428800kB managed:110016kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:320kB slab_unreclaimable:4672kB kernel_stack:1152kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
+       lowmem_reserve[]: 0 0 0 0
+       Node 7 DMA: 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB 0*8192kB 0*16384kB = 0kB
+       0 total pagecache pages
+       0 pages in swap cache
+       Swap cache stats: add 0, delete 0, find 0/0
+       Free swap  = 0kB
+       Total swap = 0kB
+       819200 pages RAM
+       0 pages HighMem/MovableOnly
+       817481 pages reserved
+       0 pages cma reserved
+       0 pages hwpoisoned
+
+The reason is that the managed memory is too low (only 110MB) while the
+rest of the 50GB is still waiting for the deferred initialization to
+be done.  update_defer_init estimates the initial memory to initialize
+to at least 2GB, but it doesn't consider any memory allocated in that
+range.  In this particular case we've had
+
+       Reserving 4096MB of memory at 128MB for crashkernel (System RAM: 51200MB)
+
+so the low 2GB is mostly depleted.
+
+Fix this by considering memblock allocations in the initial static
+initialization estimation.  Move the max_initialise computation to
+reset_deferred_meminit and implement a simple
+memblock_reserved_memory_within helper which iterates all reserved
+blocks and sums the size of those that start below the given address.
+The cumulative size is then added on top of the initial estimation.
+This is still not ideal because reset_deferred_meminit doesn't consider
+holes, so a reservation might lie above the initial estimation, which
+we ignore, but let's keep the logic simple until we really need to
+handle more complicated cases.
+
+Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set")
+Link: http://lkml.kernel.org/r/20170531104010.GI27783@dhcp22.suse.cz
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memblock.h |    8 ++++++++
+ include/linux/mmzone.h   |    1 +
+ mm/memblock.c            |   23 +++++++++++++++++++++++
+ mm/page_alloc.c          |   33 ++++++++++++++++++++++-----------
+ 4 files changed, 54 insertions(+), 11 deletions(-)
+
+--- a/include/linux/memblock.h
++++ b/include/linux/memblock.h
+@@ -423,11 +423,19 @@ static inline void early_memtest(phys_ad
+ }
+ #endif
++extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
++              phys_addr_t end_addr);
+ #else
+ static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
+ {
+       return 0;
+ }
++
++static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
++              phys_addr_t end_addr)
++{
++      return 0;
++}
+ #endif /* CONFIG_HAVE_MEMBLOCK */
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -672,6 +672,7 @@ typedef struct pglist_data {
+        * is the first PFN that needs to be initialised.
+        */
+       unsigned long first_deferred_pfn;
++      unsigned long static_init_size;
+ #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+--- a/mm/memblock.c
++++ b/mm/memblock.c
+@@ -1713,6 +1713,29 @@ static void __init_memblock memblock_dum
+       }
+ }
++extern unsigned long __init_memblock
++memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
++{
++      struct memblock_region *rgn;
++      unsigned long size = 0;
++      int idx;
++
++      for_each_memblock_type((&memblock.reserved), rgn) {
++              phys_addr_t start, end;
++
++              if (rgn->base + rgn->size < start_addr)
++                      continue;
++              if (rgn->base > end_addr)
++                      continue;
++
++              start = rgn->base;
++              end = start + rgn->size;
++              size += end - start;
++      }
++
++      return size;
++}
++
+ void __init_memblock __memblock_dump_all(void)
+ {
+       pr_info("MEMBLOCK configuration:\n");
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -291,6 +291,26 @@ int page_group_by_mobility_disabled __re
+ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ static inline void reset_deferred_meminit(pg_data_t *pgdat)
+ {
++      unsigned long max_initialise;
++      unsigned long reserved_lowmem;
++
++      /*
++       * Initialise at least 2G of a node but also take into account that
++       * two large system hashes that can take up 1GB for 0.25TB/node.
++       */
++      max_initialise = max(2UL << (30 - PAGE_SHIFT),
++              (pgdat->node_spanned_pages >> 8));
++
++      /*
++       * Compensate the all the memblock reservations (e.g. crash kernel)
++       * from the initial estimation to make sure we will initialize enough
++       * memory to boot.
++       */
++      reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
++                      pgdat->node_start_pfn + max_initialise);
++      max_initialise += reserved_lowmem;
++
++      pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
+       pgdat->first_deferred_pfn = ULONG_MAX;
+ }
+@@ -313,20 +333,11 @@ static inline bool update_defer_init(pg_
+                               unsigned long pfn, unsigned long zone_end,
+                               unsigned long *nr_initialised)
+ {
+-      unsigned long max_initialise;
+-
+       /* Always populate low zones for address-contrained allocations */
+       if (zone_end < pgdat_end_pfn(pgdat))
+               return true;
+-      /*
+-       * Initialise at least 2G of a node but also take into account that
+-       * two large system hashes that can take up 1GB for 0.25TB/node.
+-       */
+-      max_initialise = max(2UL << (30 - PAGE_SHIFT),
+-              (pgdat->node_spanned_pages >> 8));
+-
+       (*nr_initialised)++;
+-      if ((*nr_initialised > max_initialise) &&
++      if ((*nr_initialised > pgdat->static_init_size) &&
+           (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+               pgdat->first_deferred_pfn = pfn;
+               return false;
+@@ -6100,7 +6111,6 @@ void __paginginit free_area_init_node(in
+       /* pg_data_t should be reset to zero when it's allocated */
+       WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
+-      reset_deferred_meminit(pgdat);
+       pgdat->node_id = nid;
+       pgdat->node_start_pfn = node_start_pfn;
+       pgdat->per_cpu_nodestats = NULL;
+@@ -6122,6 +6132,7 @@ void __paginginit free_area_init_node(in
+               (unsigned long)pgdat->node_mem_map);
+ #endif
++      reset_deferred_meminit(pgdat);
+       free_area_init_core(pgdat);
+ }
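
For illustration, a standalone userspace model of the new sizing arithmetic, working in bytes rather than pfns and using the report's numbers (a 50GB node with crashkernel=4096M reserved at 128MB) as example data; it is not the kernel code.

  #include <stdio.h>

  struct region { unsigned long long base, size; };

  /* Same simplification as memblock_reserved_memory_within(): sum every
   * reserved region that intersects [start, end), ignoring partial overlap. */
  static unsigned long long reserved_within(const struct region *r, int n,
                                            unsigned long long start,
                                            unsigned long long end)
  {
      unsigned long long sum = 0;

      for (int i = 0; i < n; i++) {
          if (r[i].base + r[i].size < start || r[i].base > end)
              continue;
          sum += r[i].size;
      }
      return sum;
  }

  int main(void)
  {
      const unsigned long long MB = 1ULL << 20, GB = 1ULL << 30;
      unsigned long long node_spanned = 50 * GB;                 /* 50GB node */
      struct region reserved[] = { { 128 * MB, 4096 * MB } };    /* crashkernel */

      /* At least 2GB, or spanned/256 for very large nodes. */
      unsigned long long max_init = 2 * GB;
      if (node_spanned / 256 > max_init)
          max_init = node_spanned / 256;

      /* Compensate for reservations that eat into the low range. */
      max_init += reserved_within(reserved, 1, 0, max_init);
      if (max_init > node_spanned)
          max_init = node_spanned;

      printf("statically initialise %llu MB\n", max_init / MB);  /* 6144 MB */
      return 0;
  }
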
diff --git a/queue-4.11/mm-hugetlb-report-ehwpoison-not-efault-when-foll_hwpoison-is-specified.patch b/queue-4.11/mm-hugetlb-report-ehwpoison-not-efault-when-foll_hwpoison-is-specified.patch
new file mode 100644 (file)
index 0000000..b8e0803
--- /dev/null
@@ -0,0 +1,120 @@
+From 9a291a7c9428155e8e623e4a3989f8be47134df5 Mon Sep 17 00:00:00 2001
+From: James Morse <james.morse@arm.com>
+Date: Fri, 2 Jun 2017 14:46:46 -0700
+Subject: mm/hugetlb: report -EHWPOISON not -EFAULT when FOLL_HWPOISON is specified
+
+From: James Morse <james.morse@arm.com>
+
+commit 9a291a7c9428155e8e623e4a3989f8be47134df5 upstream.
+
+KVM uses get_user_pages() to resolve its stage2 faults.  KVM sets the
+FOLL_HWPOISON flag causing faultin_page() to return -EHWPOISON when it
+finds a VM_FAULT_HWPOISON.  KVM handles these hwpoison pages as a
+special case.  (check_user_page_hwpoison())
+
+When huge pages are involved, this doesn't work so well.
+get_user_pages() calls follow_hugetlb_page(), which stops early if it
+receives VM_FAULT_HWPOISON from hugetlb_fault(), eventually returning
+-EFAULT to the caller.  The step to map this to -EHWPOISON based on the
+FOLL_ flags is missing.  The hwpoison special case is skipped, and
+-EFAULT is returned to user-space, causing Qemu or kvmtool to exit.
+
+Instead, move this VM_FAULT_ to errno mapping code into a header file
+and use it from faultin_page() and follow_hugetlb_page().
+
+With this, KVM works as expected.
+
+This isn't a problem for arm64 today as we haven't enabled
+MEMORY_FAILURE, but I can't see any reason this doesn't happen on x86
+too, so I think this should be a fix.  This doesn't apply earlier than
+stable's v4.11.1 due to all sorts of cleanup.
+
+[james.morse@arm.com: add vm_fault_to_errno() call to faultin_page()]
+suggested.
+  Link: http://lkml.kernel.org/r/20170525171035.16359-1-james.morse@arm.com
+[akpm@linux-foundation.org: coding-style fixes]
+Link: http://lkml.kernel.org/r/20170524160900.28786-1-james.morse@arm.com
+Signed-off-by: James Morse <james.morse@arm.com>
+Acked-by: Punit Agrawal <punit.agrawal@arm.com>
+Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm.h |   11 +++++++++++
+ mm/gup.c           |   20 ++++++++------------
+ mm/hugetlb.c       |    5 +++++
+ 3 files changed, 24 insertions(+), 12 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2315,6 +2315,17 @@ static inline struct page *follow_page(s
+ #define FOLL_REMOTE   0x2000  /* we are working on non-current tsk/mm */
+ #define FOLL_COW      0x4000  /* internal GUP flag */
++static inline int vm_fault_to_errno(int vm_fault, int foll_flags)
++{
++      if (vm_fault & VM_FAULT_OOM)
++              return -ENOMEM;
++      if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
++              return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
++      if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
++              return -EFAULT;
++      return 0;
++}
++
+ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+                       void *data);
+ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -407,12 +407,10 @@ static int faultin_page(struct task_stru
+       ret = handle_mm_fault(vma, address, fault_flags);
+       if (ret & VM_FAULT_ERROR) {
+-              if (ret & VM_FAULT_OOM)
+-                      return -ENOMEM;
+-              if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+-                      return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
+-              if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
+-                      return -EFAULT;
++              int err = vm_fault_to_errno(ret, *flags);
++
++              if (err)
++                      return err;
+               BUG();
+       }
+@@ -723,12 +721,10 @@ retry:
+       ret = handle_mm_fault(vma, address, fault_flags);
+       major |= ret & VM_FAULT_MAJOR;
+       if (ret & VM_FAULT_ERROR) {
+-              if (ret & VM_FAULT_OOM)
+-                      return -ENOMEM;
+-              if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
+-                      return -EHWPOISON;
+-              if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
+-                      return -EFAULT;
++              int err = vm_fault_to_errno(ret, 0);
++
++              if (err)
++                      return err;
+               BUG();
+       }
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -4170,6 +4170,11 @@ long follow_hugetlb_page(struct mm_struc
+                       }
+                       ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
+                       if (ret & VM_FAULT_ERROR) {
++                              int err = vm_fault_to_errno(ret, flags);
++
++                              if (err)
++                                      return err;
++
+                               remainder = 0;
+                               break;
+                       }
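
For illustration, a standalone userspace model of the consolidated mapping. The bit values below are made-up stand-ins for the kernel's VM_FAULT_ and FOLL_ constants, only the decision logic mirrors the helper the patch adds, and the example assumes a Linux errno.h that defines EHWPOISON.

  #include <errno.h>
  #include <stdio.h>

  /* Stand-in bit values; the real VM_FAULT_ and FOLL_ constants differ. */
  #define VM_FAULT_OOM            0x0001
  #define VM_FAULT_SIGBUS         0x0002
  #define VM_FAULT_HWPOISON       0x0010
  #define VM_FAULT_HWPOISON_LARGE 0x0020
  #define VM_FAULT_SIGSEGV        0x0040
  #define FOLL_HWPOISON           0x0100

  /* Same decision logic as the vm_fault_to_errno() helper in the patch. */
  static int vm_fault_to_errno(int vm_fault, int foll_flags)
  {
      if (vm_fault & VM_FAULT_OOM)
          return -ENOMEM;
      if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
          return (foll_flags & FOLL_HWPOISON) ? -EHWPOISON : -EFAULT;
      if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
          return -EFAULT;
      return 0;
  }

  int main(void)
  {
      /* With FOLL_HWPOISON the caller now sees -EHWPOISON (-133 on Linux),
       * so KVM's hwpoison special case can run instead of getting -EFAULT. */
      printf("%d\n", vm_fault_to_errno(VM_FAULT_HWPOISON_LARGE, FOLL_HWPOISON));
      printf("%d\n", vm_fault_to_errno(VM_FAULT_HWPOISON_LARGE, 0));
      return 0;
  }
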
diff --git a/queue-4.11/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch b/queue-4.11/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch
new file mode 100644 (file)
index 0000000..6182dba
--- /dev/null
@@ -0,0 +1,87 @@
+From 30809f559a0d348c2dfd7ab05e9a451e2384962e Mon Sep 17 00:00:00 2001
+From: Punit Agrawal <punit.agrawal@arm.com>
+Date: Fri, 2 Jun 2017 14:46:40 -0700
+Subject: mm/migrate: fix refcount handling when !hugepage_migration_supported()
+
+From: Punit Agrawal <punit.agrawal@arm.com>
+
+commit 30809f559a0d348c2dfd7ab05e9a451e2384962e upstream.
+
+On failing to migrate a page, soft_offline_huge_page() performs the
+necessary update to the hugepage ref-count.
+
+But when !hugepage_migration_supported(), unmap_and_move_huge_page()
+also decrements the page ref-count for the hugepage.  The combined
+behaviour leaves the ref-count in an inconsistent state.
+
+This leads to soft lockups when running the overcommitted hugepage test
+from mce-tests suite.
+
+  Soft offlining pfn 0x83ed600 at process virtual address 0x400000000000
+  soft offline: 0x83ed600: migration failed 1, type 1fffc00000008008 (uptodate|head)
+  INFO: rcu_preempt detected stalls on CPUs/tasks:
+   Tasks blocked on level-0 rcu_node (CPUs 0-7): P2715
+    (detected by 7, t=5254 jiffies, g=963, c=962, q=321)
+    thugetlb_overco R  running task        0  2715   2685 0x00000008
+    Call trace:
+      dump_backtrace+0x0/0x268
+      show_stack+0x24/0x30
+      sched_show_task+0x134/0x180
+      rcu_print_detail_task_stall_rnp+0x54/0x7c
+      rcu_check_callbacks+0xa74/0xb08
+      update_process_times+0x34/0x60
+      tick_sched_handle.isra.7+0x38/0x70
+      tick_sched_timer+0x4c/0x98
+      __hrtimer_run_queues+0xc0/0x300
+      hrtimer_interrupt+0xac/0x228
+      arch_timer_handler_phys+0x3c/0x50
+      handle_percpu_devid_irq+0x8c/0x290
+      generic_handle_irq+0x34/0x50
+      __handle_domain_irq+0x68/0xc0
+      gic_handle_irq+0x5c/0xb0
+
+Address this by changing the putback_active_hugepage() in
+soft_offline_huge_page() to putback_movable_pages().
+
+This only triggers on systems that enable memory failure handling
+(ARCH_SUPPORTS_MEMORY_FAILURE) but not hugepage migration
+(!ARCH_ENABLE_HUGEPAGE_MIGRATION).
+
+I imagine this wasn't triggered as there aren't many systems running
+this configuration.
+
+[akpm@linux-foundation.org: remove dead comment, per Naoya]
+Link: http://lkml.kernel.org/r/20170525135146.32011-1-punit.agrawal@arm.com
+Reported-by: Manoj Iyer <manoj.iyer@canonical.com>
+Tested-by: Manoj Iyer <manoj.iyer@canonical.com>
+Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Wanpeng Li <wanpeng.li@hotmail.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c |    8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1587,12 +1587,8 @@ static int soft_offline_huge_page(struct
+       if (ret) {
+               pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+                       pfn, ret, page->flags);
+-              /*
+-               * We know that soft_offline_huge_page() tries to migrate
+-               * only one hugepage pointed to by hpage, so we need not
+-               * run through the pagelist here.
+-               */
+-              putback_active_hugepage(hpage);
++              if (!list_empty(&pagelist))
++                      putback_movable_pages(&pagelist);
+               if (ret > 0)
+                       ret = -EIO;
+       } else {
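
For illustration, a toy refcount model of the imbalance (made-up helpers, not kernel code): when migration is unsupported, the callee has already put the page back and dropped the isolation reference, so the caller must only put back what is still on its list.

  #include <stdbool.h>
  #include <stdio.h>

  struct page { int refcount; bool on_list; };     /* stand-in, not struct page */

  static void put_page(struct page *p) { p->refcount--; }

  /* What the migration core does when hugepage migration is unsupported:
   * it already takes the page off the list and drops the isolation ref. */
  static int migrate_unsupported(struct page *p)
  {
      p->on_list = false;
      put_page(p);
      return -1;                                   /* migration failed */
  }

  int main(void)
  {
      struct page hpage = { .refcount = 1, .on_list = true };  /* isolated */

      if (migrate_unsupported(&hpage)) {
          /* Old code: put the page back unconditionally -> refcount hits -1.
           * Fixed code: only put back what is still on the list. */
          if (hpage.on_list)
              put_page(&hpage);
      }
      printf("refcount = %d\n", hpage.refcount);   /* 0 with the fix */
      return 0;
  }
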
diff --git a/queue-4.11/mm-page_alloc.c-make-sure-oom-victim-can-try-allocations-with-no-watermarks-once.patch b/queue-4.11/mm-page_alloc.c-make-sure-oom-victim-can-try-allocations-with-no-watermarks-once.patch
new file mode 100644 (file)
index 0000000..91bc7c3
--- /dev/null
@@ -0,0 +1,149 @@
+From c288983dddf714216428774e022ad78f48dd8cb1 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Date: Fri, 2 Jun 2017 14:46:31 -0700
+Subject: mm/page_alloc.c: make sure OOM victim can try allocations with no watermarks once
+
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+
+commit c288983dddf714216428774e022ad78f48dd8cb1 upstream.
+
+Roman Gushchin has reported that the OOM killer can trivially select the
+next OOM victim when a thread doing memory allocation from the page fault
+path was selected as the first OOM victim.
+
+    allocate invoked oom-killer: gfp_mask=0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null),  order=0, oom_score_adj=0
+    allocate cpuset=/ mems_allowed=0
+    CPU: 1 PID: 492 Comm: allocate Not tainted 4.12.0-rc1-mm1+ #181
+    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+    Call Trace:
+     oom_kill_process+0x219/0x3e0
+     out_of_memory+0x11d/0x480
+     __alloc_pages_slowpath+0xc84/0xd40
+     __alloc_pages_nodemask+0x245/0x260
+     alloc_pages_vma+0xa2/0x270
+     __handle_mm_fault+0xca9/0x10c0
+     handle_mm_fault+0xf3/0x210
+     __do_page_fault+0x240/0x4e0
+     trace_do_page_fault+0x37/0xe0
+     do_async_page_fault+0x19/0x70
+     async_page_fault+0x28/0x30
+    ...
+    Out of memory: Kill process 492 (allocate) score 899 or sacrifice child
+    Killed process 492 (allocate) total-vm:2052368kB, anon-rss:1894576kB, file-rss:4kB, shmem-rss:0kB
+    allocate: page allocation failure: order:0, mode:0x14280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=(null)
+    allocate cpuset=/ mems_allowed=0
+    CPU: 1 PID: 492 Comm: allocate Not tainted 4.12.0-rc1-mm1+ #181
+    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+    Call Trace:
+     __alloc_pages_slowpath+0xd32/0xd40
+     __alloc_pages_nodemask+0x245/0x260
+     alloc_pages_vma+0xa2/0x270
+     __handle_mm_fault+0xca9/0x10c0
+     handle_mm_fault+0xf3/0x210
+     __do_page_fault+0x240/0x4e0
+     trace_do_page_fault+0x37/0xe0
+     do_async_page_fault+0x19/0x70
+     async_page_fault+0x28/0x30
+    ...
+    oom_reaper: reaped process 492 (allocate), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
+    ...
+    allocate invoked oom-killer: gfp_mask=0x0(), nodemask=(null),  order=0, oom_score_adj=0
+    allocate cpuset=/ mems_allowed=0
+    CPU: 1 PID: 492 Comm: allocate Not tainted 4.12.0-rc1-mm1+ #181
+    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
+    Call Trace:
+     oom_kill_process+0x219/0x3e0
+     out_of_memory+0x11d/0x480
+     pagefault_out_of_memory+0x68/0x80
+     mm_fault_error+0x8f/0x190
+     ? handle_mm_fault+0xf3/0x210
+     __do_page_fault+0x4b2/0x4e0
+     trace_do_page_fault+0x37/0xe0
+     do_async_page_fault+0x19/0x70
+     async_page_fault+0x28/0x30
+    ...
+    Out of memory: Kill process 233 (firewalld) score 10 or sacrifice child
+    Killed process 233 (firewalld) total-vm:246076kB, anon-rss:20956kB, file-rss:0kB, shmem-rss:0kB
+
+There is a race window in which the OOM reaper completes reclaiming the
+first victim's memory while nothing but mutex_trylock() prevents the
+first victim from calling out_of_memory() from pagefault_out_of_memory()
+after a memory allocation from the page fault path failed due to being
+selected as an OOM victim.
+
+This is a side effect of commit 9a67f6488eca926f ("mm: consolidate
+GFP_NOFAIL checks in the allocator slowpath") because that commit
+silently changed the behavior from
+
+    /* Avoid allocations with no watermarks from looping endlessly */
+
+to
+
+    /*
+     * Give up allocations without trying memory reserves if selected
+     * as an OOM victim
+     */
+
+in __alloc_pages_slowpath() by moving the location of the TIF_MEMDIE
+flag check.  I noticed this change but didn't post a patch because I
+thought it was an acceptable change, apart from the noise from
+warn_alloc(), since !__GFP_NOFAIL allocations are allowed to fail.  But
+we overlooked that a failing memory allocation from the page fault path
+makes a difference due to the race window explained above.
+
+While it might be possible to add a check to pagefault_out_of_memory()
+that prevents the first victim from calling out_of_memory() or remove
+out_of_memory() from pagefault_out_of_memory(), changing
+pagefault_out_of_memory() does not suppress the noise from warn_alloc()
+when the allocating thread was selected as an OOM victim.  There is
+little point in printing similar backtraces and memory information from
+both out_of_memory() and warn_alloc().
+
+Instead, if we guarantee that the current thread can try allocations with
+no watermarks once when the thread looping inside __alloc_pages_slowpath()
+has been selected as an OOM victim, we can follow the "who can use memory
+reserves" rules, suppress the noise from warn_alloc(), and prevent memory
+allocations from the page fault path from calling
+pagefault_out_of_memory().
+
+If we take the comment literally, this patch would do
+
+  -    if (test_thread_flag(TIF_MEMDIE))
+  -        goto nopage;
+  +    if (alloc_flags == ALLOC_NO_WATERMARKS || (gfp_mask & __GFP_NOMEMALLOC))
+  +        goto nopage;
+
+because gfp_pfmemalloc_allowed() returns false if __GFP_NOMEMALLOC is
+given.  But if I recall correctly (I couldn't find the message), the
+condition is meant to apply only to OOM victims despite the comment.
+Therefore, this patch preserves the TIF_MEMDIE check.
+
+Fixes: 9a67f6488eca926f ("mm: consolidate GFP_NOFAIL checks in the allocator slowpath")
+Link: http://lkml.kernel.org/r/201705192112.IAF69238.OQOHSJLFOFFMtV@I-love.SAKURA.ne.jp
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Reported-by: Roman Gushchin <guro@fb.com>
+Tested-by: Roman Gushchin <guro@fb.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_alloc.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3834,7 +3834,9 @@ retry:
+               goto got_pg;
+       /* Avoid allocations with no watermarks from looping endlessly */
+-      if (test_thread_flag(TIF_MEMDIE))
++      if (test_thread_flag(TIF_MEMDIE) &&
++          (alloc_flags == ALLOC_NO_WATERMARKS ||
++           (gfp_mask & __GFP_NOMEMALLOC)))
+               goto nopage;
+       /* Retry as long as the OOM killer is making progress */
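
For illustration, a standalone sketch of the amended bail-out condition with stand-in constants: an OOM victim now goes to nopage only once it has been granted no-watermark access, or when __GFP_NOMEMALLOC forbids using reserves, so it gets at least one attempt that can dip into memory reserves.

  #include <stdbool.h>
  #include <stdio.h>

  /* Stand-in values; the real ALLOC_ and GFP_ constants differ. */
  #define ALLOC_WMARK_MIN     0x00
  #define ALLOC_NO_WATERMARKS 0x04
  #define __GFP_NOMEMALLOC    0x10000u

  static bool goto_nopage(bool tif_memdie, int alloc_flags, unsigned gfp_mask)
  {
      return tif_memdie &&
             (alloc_flags == ALLOC_NO_WATERMARKS ||
              (gfp_mask & __GFP_NOMEMALLOC));
  }

  int main(void)
  {
      /* OOM victim not yet granted no-watermark access: retry once more. */
      printf("%d\n", goto_nopage(true, ALLOC_WMARK_MIN, 0));                 /* 0 */
      /* Victim already allowed to ignore watermarks: give up (nopage). */
      printf("%d\n", goto_nopage(true, ALLOC_NO_WATERMARKS, 0));             /* 1 */
      /* Victim whose caller forbids dipping into reserves: give up as well. */
      printf("%d\n", goto_nopage(true, ALLOC_WMARK_MIN, __GFP_NOMEMALLOC));  /* 1 */
      return 0;
  }
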
diff --git a/queue-4.11/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch b/queue-4.11/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch
new file mode 100644 (file)
index 0000000..9125623
--- /dev/null
@@ -0,0 +1,55 @@
+From 4d071c3238987325b9e50e33051a40d1cce311cc Mon Sep 17 00:00:00 2001
+From: Imre Deak <imre.deak@intel.com>
+Date: Tue, 23 May 2017 14:18:17 -0500
+Subject: PCI/PM: Add needs_resume flag to avoid suspend complete optimization
+
+From: Imre Deak <imre.deak@intel.com>
+
+commit 4d071c3238987325b9e50e33051a40d1cce311cc upstream.
+
+Some drivers - like i915 - may not support the system suspend direct
+complete optimization due to differences in their runtime and system
+suspend sequence.  Add a flag that, when set, resumes the device before
+calling the driver's system suspend handlers, which effectively disables
+the optimization.
+
+Needed by a future patch fixing suspend/resume on i915.
+
+Suggested by Rafael.
+
+Signed-off-by: Imre Deak <imre.deak@intel.com>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pci.c   |    3 ++-
+ include/linux/pci.h |    5 +++++
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -2142,7 +2142,8 @@ bool pci_dev_keep_suspended(struct pci_d
+       if (!pm_runtime_suspended(dev)
+           || pci_target_state(pci_dev) != pci_dev->current_state
+-          || platform_pci_need_resume(pci_dev))
++          || platform_pci_need_resume(pci_dev)
++          || (pci_dev->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME))
+               return false;
+       /*
+--- a/include/linux/pci.h
++++ b/include/linux/pci.h
+@@ -178,6 +178,11 @@ enum pci_dev_flags {
+       PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7),
+       /* Get VPD from function 0 VPD */
+       PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8),
++      /*
++       * Resume before calling the driver's system suspend hooks, disabling
++       * the direct_complete optimization.
++       */
++      PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11),
+ };
+ enum pci_irq_reroute_variant {
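
For illustration, a standalone userspace model of the amended keep-suspended decision; the struct and helpers are stand-ins for the PCI core's, and only the flag bit value comes from the patch. A driver such as i915 would opt out by setting the flag in its dev_flags, as modelled below; the real call site belongs to the follow-up i915 fix and is hypothetical here.

  #include <stdbool.h>
  #include <stdio.h>

  #define PCI_DEV_FLAGS_NEEDS_RESUME (1u << 11)    /* bit value from the patch */

  struct pci_dev_model {                           /* stand-in, not struct pci_dev */
      bool runtime_suspended;
      bool target_state_matches;
      bool platform_needs_resume;
      unsigned int dev_flags;
  };

  /* Mirrors the amended condition in pci_dev_keep_suspended(): any of these
   * forces the device to be resumed before the driver's suspend hooks run. */
  static bool keep_suspended(const struct pci_dev_model *d)
  {
      if (!d->runtime_suspended ||
          !d->target_state_matches ||
          d->platform_needs_resume ||
          (d->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME))
          return false;
      return true;
  }

  int main(void)
  {
      struct pci_dev_model dev = {
          .runtime_suspended = true,
          .target_state_matches = true,
          .platform_needs_resume = false,
          .dev_flags = 0,
      };

      printf("%d\n", keep_suspended(&dev));         /* 1: direct_complete used */
      dev.dev_flags |= PCI_DEV_FLAGS_NEEDS_RESUME;  /* driver opts out */
      printf("%d\n", keep_suspended(&dev));         /* 0: resumed first */
      return 0;
  }
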
diff --git a/queue-4.11/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch b/queue-4.11/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch
new file mode 100644 (file)
index 0000000..eada79a
--- /dev/null
@@ -0,0 +1,63 @@
+From 1feb40067cf04ae48d65f728d62ca255c9449178 Mon Sep 17 00:00:00 2001
+From: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Date: Fri, 12 May 2017 09:02:00 -0700
+Subject: RDMA/qib,hfi1: Fix MR reference count leak on write with immediate
+
+From: Mike Marciniszyn <mike.marciniszyn@intel.com>
+
+commit 1feb40067cf04ae48d65f728d62ca255c9449178 upstream.
+
+The handling of IB_RDMA_WRITE_ONLY_WITH_IMMEDIATE will leak a memory
+reference when a buffer cannot be allocated for returning the immediate
+data.
+
+The issue is that the rkey validation has already occurred and the RNR
+nak fails to release the reference that was fruitlessly taken.  The
+peer will then send the identical single-packet request when its RNR
+timer pops.
+
+The fix is to release the held reference prior to the rnr nak exit.
+This is the only sequence that requires both rkey validation and the
+buffer allocation on the same packet.
+
+Tested-by: Tadeusz Struk <tadeusz.struk@intel.com>
+Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
+Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
+Signed-off-by: Doug Ledford <dledford@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/hw/hfi1/rc.c    |    5 ++++-
+ drivers/infiniband/hw/qib/qib_rc.c |    4 +++-
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+--- a/drivers/infiniband/hw/hfi1/rc.c
++++ b/drivers/infiniband/hw/hfi1/rc.c
+@@ -2149,8 +2149,11 @@ send_last:
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+-              if (!ret)
++              if (!ret) {
++                      /* peer will send again */
++                      rvt_put_ss(&qp->r_sge);
+                       goto rnr_nak;
++              }
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+--- a/drivers/infiniband/hw/qib/qib_rc.c
++++ b/drivers/infiniband/hw/qib/qib_rc.c
+@@ -1947,8 +1947,10 @@ send_last:
+               ret = qib_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+-              if (!ret)
++              if (!ret) {
++                      rvt_put_ss(&qp->r_sge);
+                       goto rnr_nak;
++              }
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
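
For illustration, a toy model of the leak pattern with made-up helpers (no verbs code): the reference taken during rkey validation must be dropped on the RNR NAK path, because the peer will retransmit and validation will take it again.

  #include <stdbool.h>
  #include <stdio.h>

  static int mr_refcount;                /* stands in for the MR reference */

  static void get_ref(void) { mr_refcount++; }   /* taken by rkey validation */
  static void put_ref(void) { mr_refcount--; }   /* rvt_put_ss() in the fix */

  /* One RDMA WRITE-with-immediate whose receive buffer allocation fails. */
  static void handle_write_imm(bool fixed)
  {
      bool got_rwqe = false;             /* the get_rwqe() call returned 0 */

      get_ref();                         /* rkey validated, reference held */
      if (!got_rwqe) {
          if (fixed)
              put_ref();                 /* drop it before sending the RNR NAK */
          return;                        /* peer will resend the same packet */
      }
  }

  int main(void)
  {
      handle_write_imm(false);
      printf("without fix: %d leaked reference(s)\n", mr_refcount);  /* 1 */
      mr_refcount = 0;
      handle_write_imm(true);
      printf("with fix:    %d leaked reference(s)\n", mr_refcount);  /* 0 */
      return 0;
  }
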
diff --git a/queue-4.11/rdma-srp-fix-null-deref-at-srp_destroy_qp.patch b/queue-4.11/rdma-srp-fix-null-deref-at-srp_destroy_qp.patch
new file mode 100644 (file)
index 0000000..7d931f7
--- /dev/null
@@ -0,0 +1,36 @@
+From 95c2ef50c726a51d580c35ae8dccd383abaa8701 Mon Sep 17 00:00:00 2001
+From: Israel Rukshin <israelr@mellanox.com>
+Date: Thu, 11 May 2017 18:52:36 +0300
+Subject: RDMA/srp: Fix NULL deref at srp_destroy_qp()
+
+From: Israel Rukshin <israelr@mellanox.com>
+
+commit 95c2ef50c726a51d580c35ae8dccd383abaa8701 upstream.
+
+If srp_init_qp() fails in srp_create_ch_ib() then ch->send_cq
+may be NULL.  Calling ib_destroy_qp() directly is sufficient
+because no work requests were posted on the created qp.
+
+Fixes: 9294000d6d89 ("IB/srp: Drain the send queue before destroying a QP")
+Signed-off-by: Israel Rukshin <israelr@mellanox.com>
+Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
+Reviewed-by: Bart van Assche <bart.vanassche@sandisk.com>
+Signed-off-by: Doug Ledford <dledford@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/ulp/srp/ib_srp.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/infiniband/ulp/srp/ib_srp.c
++++ b/drivers/infiniband/ulp/srp/ib_srp.c
+@@ -570,7 +570,7 @@ static int srp_create_ch_ib(struct srp_r
+       return 0;
+ err_qp:
+-      srp_destroy_qp(ch, qp);
++      ib_destroy_qp(qp);
+ err_send_cq:
+       ib_free_cq(send_cq);
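
For illustration, a toy model of why the early error path must not use the full teardown helper; the structures and functions below are stand-ins, not the verbs API.

  #include <stdio.h>
  #include <stdlib.h>

  struct cq { int id; };
  struct qp { int dummy; };
  struct ch { struct qp *qp; struct cq *send_cq; };

  static void ib_destroy_qp(struct qp *qp)
  {
      free(qp);                          /* nothing was posted; plain teardown */
  }

  static void srp_destroy_qp(struct ch *ch)
  {
      /* The full helper drains outstanding work through ch->send_cq first,
       * which dereferences NULL on the early error path modelled below. */
      printf("draining via cq %d\n", ch->send_cq->id);
      ib_destroy_qp(ch->qp);
  }

  int main(void)
  {
      /* srp_init_qp() failed before ch->send_cq was set up. */
      struct ch ch = { .qp = malloc(sizeof(struct qp)), .send_cq = NULL };

      ib_destroy_qp(ch.qp);              /* the fix: destroy the qp directly */
      (void)srp_destroy_qp;              /* calling it here would crash */
      return 0;
  }
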
diff --git a/queue-4.11/series b/queue-4.11/series
index 7c1cff3d497b03229802cede363b45b9e203d8e7..b0ecc875a0de84a6b9495482cf3cbfa2c91eb4e2 100644 (file)
--- a/queue-4.11/series
+++ b/queue-4.11/series
@@ -76,6 +76,16 @@ alsa-hda-no-loopback-on-alc299-codec.patch
 alsa-hda-apply-stac_9200_dell_m22-quirk-for-dell-latitude-d430.patch
 revert-alsa-usb-audio-purge-needless-variable-length-array.patch
 alsa-usb-fix-a-typo-in-tascam-us-16x08-mixer-element.patch
+mm-page_alloc.c-make-sure-oom-victim-can-try-allocations-with-no-watermarks-once.patch
+mm-avoid-spurious-bad-pmd-warning-messages.patch
+dax-fix-race-between-colliding-pmd-pte-entries.patch
+mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch
+mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch
+mm-hugetlb-report-ehwpoison-not-efault-when-foll_hwpoison-is-specified.patch
+mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch
+rdma-srp-fix-null-deref-at-srp_destroy_qp.patch
+rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch
+pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch
 x86-boot-use-cross_compile-prefix-for-readelf.patch
 ksm-prevent-crash-after-write_protect_page-fails.patch
 slub-memcg-cure-the-brainless-abuse-of-sysfs-attributes.patch