4.9-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 5 Jun 2017 13:17:11 +0000 (15:17 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 5 Jun 2017 13:17:11 +0000 (15:17 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Jun 2017 13:17:11 +0000 (15:17 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 5 Jun 2017 13:17:11 +0000 (15:17 +0200)
diff --git a/queue-4.9/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch b/queue-4.9/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch

new file mode 100644 (file)

index 0000000..1a113c6
--- /dev/null
+++ b/queue-4.9/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch
@@ -0,0 +1,113 @@
+From 70feee0e1ef331b22cc51f383d532a0d043fbdcc Mon Sep 17 00:00:00 2001
+From: Yisheng Xie <xieyisheng1@huawei.com>
+Date: Fri, 2 Jun 2017 14:46:43 -0700
+Subject: mlock: fix mlock count can not decrease in race condition
+
+From: Yisheng Xie <xieyisheng1@huawei.com>
+
+commit 70feee0e1ef331b22cc51f383d532a0d043fbdcc upstream.
+
+Kefeng reported that when running the following test, the mlock count in
+meminfo will increase permanently:
+
+ [1] testcase
+ linux:~ # cat test_mlockal
+ grep Mlocked /proc/meminfo
+  for j in `seq 0 10`
+  do
+       for i in `seq 4 15`
+       do
+               ./p_mlockall >> log &
+       done
+       sleep 0.2
+ done
+ # wait some time to let mlock counter decrease and 5s may not enough
+ sleep 5
+ grep Mlocked /proc/meminfo
+
+ linux:~ # cat p_mlockall.c
+ #include <sys/mman.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+
+ #define SPACE_LEN     4096
+
+ int main(int argc, char ** argv)
+ {
+               int ret;
+               void *adr = malloc(SPACE_LEN);
+               if (!adr)
+                       return -1;
+
+               ret = mlockall(MCL_CURRENT | MCL_FUTURE);
+               printf("mlcokall ret = %d\n", ret);
+
+               ret = munlockall();
+               printf("munlcokall ret = %d\n", ret);
+
+               free(adr);
+               return 0;
+        }
+
+In __munlock_pagevec() we should decrement NR_MLOCK for each page where
+we clear the PageMlocked flag.  Commit 1ebb7cc6a583 ("mm: munlock: batch
+NR_MLOCK zone state updates") has introduced a bug where we don't
+decrement NR_MLOCK for pages where we clear the flag, but fail to
+isolate them from the lru list (e.g.  when the pages are on some other
+cpu's percpu pagevec).  Since PageMlocked stays cleared, the NR_MLOCK
+accounting gets permanently disrupted by this.
+
+Fix it by counting the number of pages whose PageMlocked flag is cleared.
+
+Fixes: 1ebb7cc6a583 ("mm: munlock: batch NR_MLOCK zone state updates")
+Link: http://lkml.kernel.org/r/1495678405-54569-1-git-send-email-xieyisheng1@huawei.com
+Signed-off-by: Yisheng Xie <xieyisheng1@huawei.com>
+Reported-by: Kefeng Wang <wangkefeng.wang@huawei.com>
+Tested-by: Kefeng Wang <wangkefeng.wang@huawei.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Joern Engel <joern@logfs.org>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michel Lespinasse <walken@google.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Xishi Qiu <qiuxishi@huawei.com>
+Cc: zhongjiang <zhongjiang@huawei.com>
+Cc: Hanjun Guo <guohanjun@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mlock.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/mlock.c
++++ b/mm/mlock.c
+@@ -285,7 +285,7 @@ static void __munlock_pagevec(struct pag
+ {
+       int i;
+       int nr = pagevec_count(pvec);
+-      int delta_munlocked;
++      int delta_munlocked = -nr;
+       struct pagevec pvec_putback;
+       int pgrescued = 0;
+ 
+@@ -305,6 +305,8 @@ static void __munlock_pagevec(struct pag
+                               continue;
+                       else
+                               __munlock_isolation_failed(page);
++              } else {
++                      delta_munlocked++;
+               }
+ 
+               /*
+@@ -316,7 +318,6 @@ static void __munlock_pagevec(struct pag
+               pagevec_add(&pvec_putback, pvec->pages[i]);
+               pvec->pages[i] = NULL;
+       }
+-      delta_munlocked = -nr + pagevec_count(&pvec_putback);
+       __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+       spin_unlock_irq(zone_lru_lock(zone));
+ 
diff --git a/queue-4.9/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch b/queue-4.9/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch

new file mode 100644 (file)

index 0000000..59bf696
--- /dev/null
+++ b/queue-4.9/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch
@@ -0,0 +1,217 @@
+From 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 Mon Sep 17 00:00:00 2001
+From: Michal Hocko <mhocko@suse.com>
+Date: Fri, 2 Jun 2017 14:46:49 -0700
+Subject: mm: consider memblock reservations for deferred memory initialization sizing
+
+From: Michal Hocko <mhocko@suse.com>
+
+commit 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 upstream.
+
+We have seen an early OOM killer invocation on ppc64 systems with
+crashkernel=4096M:
+
+       kthreadd invoked oom-killer: gfp_mask=0x16040c0(GFP_KERNEL|__GFP_COMP|__GFP_NOTRACK), nodemask=7, order=0, oom_score_adj=0
+       kthreadd cpuset=/ mems_allowed=7
+       CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.4.68-1.gd7fe927-default #1
+       Call Trace:
+         dump_stack+0xb0/0xf0 (unreliable)
+         dump_header+0xb0/0x258
+         out_of_memory+0x5f0/0x640
+         __alloc_pages_nodemask+0xa8c/0xc80
+         kmem_getpages+0x84/0x1a0
+         fallback_alloc+0x2a4/0x320
+         kmem_cache_alloc_node+0xc0/0x2e0
+         copy_process.isra.25+0x260/0x1b30
+         _do_fork+0x94/0x470
+         kernel_thread+0x48/0x60
+         kthreadd+0x264/0x330
+         ret_from_kernel_thread+0x5c/0xa4
+
+       Mem-Info:
+       active_anon:0 inactive_anon:0 isolated_anon:0
+        active_file:0 inactive_file:0 isolated_file:0
+        unevictable:0 dirty:0 writeback:0 unstable:0
+        slab_reclaimable:5 slab_unreclaimable:73
+        mapped:0 shmem:0 pagetables:0 bounce:0
+        free:0 free_pcp:0 free_cma:0
+       Node 7 DMA free:0kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:52428800kB managed:110016kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:320kB slab_unreclaimable:4672kB kernel_stack:1152kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
+       lowmem_reserve[]: 0 0 0 0
+       Node 7 DMA: 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB 0*8192kB 0*16384kB = 0kB
+       0 total pagecache pages
+       0 pages in swap cache
+       Swap cache stats: add 0, delete 0, find 0/0
+       Free swap  = 0kB
+       Total swap = 0kB
+       819200 pages RAM
+       0 pages HighMem/MovableOnly
+       817481 pages reserved
+       0 pages cma reserved
+       0 pages hwpoisoned
+
+The reason is that the managed memory is too low (only 110MB) while the
+rest of the 50GB is still waiting for the deferred initialization to
+be done.  update_defer_init estimates the initial memory to initialize
+to 2GB at least but it doesn't consider any memory allocated in that
+range.  In this particular case we've had
+
+       Reserving 4096MB of memory at 128MB for crashkernel (System RAM: 51200MB)
+
+so the low 2GB is mostly depleted.
+
+Fix this by considering memblock allocations in the initial static
+initialization estimation.  Move the max_initialise to
+reset_deferred_meminit and implement a simple memblock_reserved_memory
+helper which iterates all reserved blocks and sums the size of all that
+start below the given address.  The cumulative size is then added on top
+of the initial estimation.  This is still not ideal because
+reset_deferred_meminit doesn't consider holes and so reservation might
+be above the initial estimation which we ignore but let's make the
+logic simpler until we really need to handle more complicated cases.
+
+Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set")
+Link: http://lkml.kernel.org/r/20170531104010.GI27783@dhcp22.suse.cz
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Mel Gorman <mgorman@suse.de>
+Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/memblock.h |    8 ++++++++
+ include/linux/mmzone.h   |    1 +
+ mm/memblock.c            |   23 +++++++++++++++++++++++
+ mm/page_alloc.c          |   33 ++++++++++++++++++++++-----------
+ 4 files changed, 54 insertions(+), 11 deletions(-)
+
+--- a/include/linux/memblock.h
++++ b/include/linux/memblock.h
+@@ -421,11 +421,19 @@ static inline void early_memtest(phys_ad
+ }
+ #endif
+ 
++extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
++              phys_addr_t end_addr);
+ #else
+ static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
+ {
+       return 0;
+ }
++
++static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
++              phys_addr_t end_addr)
++{
++      return 0;
++}
+ 
+ #endif /* CONFIG_HAVE_MEMBLOCK */
+ 
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -672,6 +672,7 @@ typedef struct pglist_data {
+        * is the first PFN that needs to be initialised.
+        */
+       unsigned long first_deferred_pfn;
++      unsigned long static_init_size;
+ #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+--- a/mm/memblock.c
++++ b/mm/memblock.c
+@@ -1696,6 +1696,29 @@ static void __init_memblock memblock_dum
+       }
+ }
+ 
++extern unsigned long __init_memblock
++memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
++{
++      struct memblock_region *rgn;
++      unsigned long size = 0;
++      int idx;
++
++      for_each_memblock_type((&memblock.reserved), rgn) {
++              phys_addr_t start, end;
++
++              if (rgn->base + rgn->size < start_addr)
++                      continue;
++              if (rgn->base > end_addr)
++                      continue;
++
++              start = rgn->base;
++              end = start + rgn->size;
++              size += end - start;
++      }
++
++      return size;
++}
++
+ void __init_memblock __memblock_dump_all(void)
+ {
+       pr_info("MEMBLOCK configuration:\n");
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -286,6 +286,26 @@ int page_group_by_mobility_disabled __re
+ #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ static inline void reset_deferred_meminit(pg_data_t *pgdat)
+ {
++      unsigned long max_initialise;
++      unsigned long reserved_lowmem;
++
++      /*
++       * Initialise at least 2G of a node but also take into account that
++       * two large system hashes that can take up 1GB for 0.25TB/node.
++       */
++      max_initialise = max(2UL << (30 - PAGE_SHIFT),
++              (pgdat->node_spanned_pages >> 8));
++
++      /*
++       * Compensate the all the memblock reservations (e.g. crash kernel)
++       * from the initial estimation to make sure we will initialize enough
++       * memory to boot.
++       */
++      reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
++                      pgdat->node_start_pfn + max_initialise);
++      max_initialise += reserved_lowmem;
++
++      pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
+       pgdat->first_deferred_pfn = ULONG_MAX;
+ }
+ 
+@@ -308,20 +328,11 @@ static inline bool update_defer_init(pg_
+                               unsigned long pfn, unsigned long zone_end,
+                               unsigned long *nr_initialised)
+ {
+-      unsigned long max_initialise;
+-
+       /* Always populate low zones for address-contrained allocations */
+       if (zone_end < pgdat_end_pfn(pgdat))
+               return true;
+-      /*
+-       * Initialise at least 2G of a node but also take into account that
+-       * two large system hashes that can take up 1GB for 0.25TB/node.
+-       */
+-      max_initialise = max(2UL << (30 - PAGE_SHIFT),
+-              (pgdat->node_spanned_pages >> 8));
+-
+       (*nr_initialised)++;
+-      if ((*nr_initialised > max_initialise) &&
++      if ((*nr_initialised > pgdat->static_init_size) &&
+           (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+               pgdat->first_deferred_pfn = pfn;
+               return false;
+@@ -5911,7 +5922,6 @@ void __paginginit free_area_init_node(in
+       /* pg_data_t should be reset to zero when it's allocated */
+       WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
+ 
+-      reset_deferred_meminit(pgdat);
+       pgdat->node_id = nid;
+       pgdat->node_start_pfn = node_start_pfn;
+       pgdat->per_cpu_nodestats = NULL;
+@@ -5933,6 +5943,7 @@ void __paginginit free_area_init_node(in
+               (unsigned long)pgdat->node_mem_map);
+ #endif
+ 
++      reset_deferred_meminit(pgdat);
+       free_area_init_core(pgdat);
+ }
+ 
diff --git a/queue-4.9/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch b/queue-4.9/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch

new file mode 100644 (file)

index 0000000..6182dba
--- /dev/null
+++ b/queue-4.9/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch
@@ -0,0 +1,87 @@
+From 30809f559a0d348c2dfd7ab05e9a451e2384962e Mon Sep 17 00:00:00 2001
+From: Punit Agrawal <punit.agrawal@arm.com>
+Date: Fri, 2 Jun 2017 14:46:40 -0700
+Subject: mm/migrate: fix refcount handling when !hugepage_migration_supported()
+
+From: Punit Agrawal <punit.agrawal@arm.com>
+
+commit 30809f559a0d348c2dfd7ab05e9a451e2384962e upstream.
+
+On failing to migrate a page, soft_offline_huge_page() performs the
+necessary update to the hugepage ref-count.
+
+But when !hugepage_migration_supported(), unmap_and_move_hugepage()
+also decrements the page ref-count for the hugepage.  The combined
+behaviour leaves the ref-count in an inconsistent state.
+
+This leads to soft lockups when running the overcommitted hugepage test
+from mce-tests suite.
+
+  Soft offlining pfn 0x83ed600 at process virtual address 0x400000000000
+  soft offline: 0x83ed600: migration failed 1, type 1fffc00000008008 (uptodate|head)
+  INFO: rcu_preempt detected stalls on CPUs/tasks:
+   Tasks blocked on level-0 rcu_node (CPUs 0-7): P2715
+    (detected by 7, t=5254 jiffies, g=963, c=962, q=321)
+    thugetlb_overco R  running task        0  2715   2685 0x00000008
+    Call trace:
+      dump_backtrace+0x0/0x268
+      show_stack+0x24/0x30
+      sched_show_task+0x134/0x180
+      rcu_print_detail_task_stall_rnp+0x54/0x7c
+      rcu_check_callbacks+0xa74/0xb08
+      update_process_times+0x34/0x60
+      tick_sched_handle.isra.7+0x38/0x70
+      tick_sched_timer+0x4c/0x98
+      __hrtimer_run_queues+0xc0/0x300
+      hrtimer_interrupt+0xac/0x228
+      arch_timer_handler_phys+0x3c/0x50
+      handle_percpu_devid_irq+0x8c/0x290
+      generic_handle_irq+0x34/0x50
+      __handle_domain_irq+0x68/0xc0
+      gic_handle_irq+0x5c/0xb0
+
+Address this by changing the putback_active_hugepage() in
+soft_offline_huge_page() to putback_movable_pages().
+
+This only triggers on systems that enable memory failure handling
+(ARCH_SUPPORTS_MEMORY_FAILURE) but not hugepage migration
+(!ARCH_ENABLE_HUGEPAGE_MIGRATION).
+
+I imagine this wasn't triggered as there aren't many systems running
+this configuration.
+
+[akpm@linux-foundation.org: remove dead comment, per Naoya]
+Link: http://lkml.kernel.org/r/20170525135146.32011-1-punit.agrawal@arm.com
+Reported-by: Manoj Iyer <manoj.iyer@canonical.com>
+Tested-by: Manoj Iyer <manoj.iyer@canonical.com>
+Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Signed-off-by: Punit Agrawal <punit.agrawal@arm.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Wanpeng Li <wanpeng.li@hotmail.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c |    8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1587,12 +1587,8 @@ static int soft_offline_huge_page(struct
+       if (ret) {
+               pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+                       pfn, ret, page->flags);
+-              /*
+-               * We know that soft_offline_huge_page() tries to migrate
+-               * only one hugepage pointed to by hpage, so we need not
+-               * run through the pagelist here.
+-               */
+-              putback_active_hugepage(hpage);
++              if (!list_empty(&pagelist))
++                      putback_movable_pages(&pagelist);
+               if (ret > 0)
+                       ret = -EIO;
+       } else {
diff --git a/queue-4.9/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch b/queue-4.9/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch

new file mode 100644 (file)

index 0000000..9125623
--- /dev/null
+++ b/queue-4.9/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch
@@ -0,0 +1,55 @@
+From 4d071c3238987325b9e50e33051a40d1cce311cc Mon Sep 17 00:00:00 2001
+From: Imre Deak <imre.deak@intel.com>
+Date: Tue, 23 May 2017 14:18:17 -0500
+Subject: PCI/PM: Add needs_resume flag to avoid suspend complete optimization
+
+From: Imre Deak <imre.deak@intel.com>
+
+commit 4d071c3238987325b9e50e33051a40d1cce311cc upstream.
+
+Some drivers - like i915 - may not support the system suspend direct
+complete optimization due to differences in their runtime and system
+suspend sequence.  Add a flag that when set resumes the device before
+calling the driver's system suspend handlers which effectively disables
+the optimization.
+
+Needed by a future patch fixing suspend/resume on i915.
+
+Suggested by Rafael.
+
+Signed-off-by: Imre Deak <imre.deak@intel.com>
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/pci/pci.c   |    3 ++-
+ include/linux/pci.h |    5 +++++
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/pci/pci.c
++++ b/drivers/pci/pci.c
+@@ -2142,7 +2142,8 @@ bool pci_dev_keep_suspended(struct pci_d
+ 
+       if (!pm_runtime_suspended(dev)
+           || pci_target_state(pci_dev) != pci_dev->current_state
+-          || platform_pci_need_resume(pci_dev))
++          || platform_pci_need_resume(pci_dev)
++          || (pci_dev->dev_flags & PCI_DEV_FLAGS_NEEDS_RESUME))
+               return false;
+ 
+       /*
+--- a/include/linux/pci.h
++++ b/include/linux/pci.h
+@@ -178,6 +178,11 @@ enum pci_dev_flags {
+       PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7),
+       /* Get VPD from function 0 VPD */
+       PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8),
++      /*
++       * Resume before calling the driver's system suspend hooks, disabling
++       * the direct_complete optimization.
++       */
++      PCI_DEV_FLAGS_NEEDS_RESUME = (__force pci_dev_flags_t) (1 << 11),
+ };
+ 
+ enum pci_irq_reroute_variant {
diff --git a/queue-4.9/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch b/queue-4.9/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch

new file mode 100644 (file)

index 0000000..f6f582d
--- /dev/null
+++ b/queue-4.9/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch
@@ -0,0 +1,63 @@
+From 1feb40067cf04ae48d65f728d62ca255c9449178 Mon Sep 17 00:00:00 2001
+From: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Date: Fri, 12 May 2017 09:02:00 -0700
+Subject: RDMA/qib,hfi1: Fix MR reference count leak on write with immediate
+
+From: Mike Marciniszyn <mike.marciniszyn@intel.com>
+
+commit 1feb40067cf04ae48d65f728d62ca255c9449178 upstream.
+
+The handling of IB_RDMA_WRITE_ONLY_WITH_IMMEDIATE will leak a memory
+reference when a buffer cannot be allocated for returning the immediate
+data.
+
+The issue is that the rkey validation has already occurred and the RNR
+nak fails to release the reference that was fruitlessly gotten.  The
+peer will send the identical single packet request when its RNR
+timer pops.
+
+The fix is to release the held reference prior to the rnr nak exit.
+This is the only sequence that requires both rkey validation and the
+buffer allocation on the same packet.
+
+Tested-by: Tadeusz Struk <tadeusz.struk@intel.com>
+Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
+Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
+Signed-off-by: Doug Ledford <dledford@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/hw/hfi1/rc.c    |    5 ++++-
+ drivers/infiniband/hw/qib/qib_rc.c |    4 +++-
+ 2 files changed, 7 insertions(+), 2 deletions(-)
+
+--- a/drivers/infiniband/hw/hfi1/rc.c
++++ b/drivers/infiniband/hw/hfi1/rc.c
+@@ -2366,8 +2366,11 @@ send_last:
+               ret = hfi1_rvt_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+-              if (!ret)
++              if (!ret) {
++                      /* peer will send again */
++                      rvt_put_ss(&qp->r_sge);
+                       goto rnr_nak;
++              }
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               wc.wc_flags = IB_WC_WITH_IMM;
+               goto send_last;
+--- a/drivers/infiniband/hw/qib/qib_rc.c
++++ b/drivers/infiniband/hw/qib/qib_rc.c
+@@ -2067,8 +2067,10 @@ send_last:
+               ret = qib_get_rwqe(qp, 1);
+               if (ret < 0)
+                       goto nack_op_err;
+-              if (!ret)
++              if (!ret) {
++                      rvt_put_ss(&qp->r_sge);
+                       goto rnr_nak;
++              }
+               wc.ex.imm_data = ohdr->u.rc.imm_data;
+               hdrsize += 4;
+               wc.wc_flags = IB_WC_WITH_IMM;
diff --git a/queue-4.9/series b/queue-4.9/series

index 1b3c8812ee41b5596596f0ea4695045c56065fb7..4a09c63c21bf5c206a0823cfdb9d4d64b55c56fd 100644 (file)
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -58,6 +58,11 @@ drm-radeon-unbreak-hpd-handling-for-r600.patch
  drm-radeon-fix-vram_size-visible-values-in-drm_radeon_gem_info-ioctl.patch
  pcmcia-remove-left-over-z-format.patch
  alsa-hda-apply-stac_9200_dell_m22-quirk-for-dell-latitude-d430.patch
+mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch
+mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch
+mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch
+rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch
+pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch
  x86-boot-use-cross_compile-prefix-for-readelf.patch
  ksm-prevent-crash-after-write_protect_page-fails.patch
  slub-memcg-cure-the-brainless-abuse-of-sysfs-attributes.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 5 Jun 2017 13:17:11 +0000 (15:17 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 5 Jun 2017 13:17:11 +0000 (15:17 +0200)
queue-4.9/mlock-fix-mlock-count-can-not-decrease-in-race-condition.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/mm-consider-memblock-reservations-for-deferred-memory-initialization-sizing.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/mm-migrate-fix-refcount-handling-when-hugepage_migration_supported.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/pci-pm-add-needs_resume-flag-to-avoid-suspend-complete-optimization.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/rdma-qib-hfi1-fix-mr-reference-count-leak-on-write-with-immediate.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/series		patch \| blob \| blame \| history