5.3-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sun, 27 Oct 2019 13:49:22 +0000 (14:49 +0100)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sun, 27 Oct 2019 13:49:22 +0000 (14:49 +0100)
added patches:
hugetlbfs-don-t-access-uninitialized-memmaps-in-pfn_range_valid_gigantic.patch
mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch
mm-memblock-do-not-enforce-current-limit-for-memblock_phys-family.patch
mm-memcg-get-number-of-pages-on-the-lru-list-in-memcgroup-base-on-lru_zone_size.patch
mm-memcg-slab-fix-panic-in-__free_slab-caused-by-premature-memcg-pointer-release.patch
mm-memory-failure-poison-read-receives-sigkill-instead-of-sigbus-if-mmaped-more-than-once.patch
mm-memory-failure.c-don-t-access-uninitialized-memmaps-in-memory_failure.patch
mm-memory_hotplug-don-t-access-uninitialized-memmaps-in-shrink_pgdat_span.patch
mm-memunmap-don-t-access-uninitialized-memmap-in-memunmap_pages.patch
mm-page_owner-don-t-access-uninitialized-memmaps-when-reading-proc-pagetypeinfo.patch
mm-slub-fix-a-deadlock-in-show_slab_objects.patch
mmc-cqhci-commit-descriptors-before-setting-the-doorbell.patch
mmc-mxs-fix-flags-passed-to-dmaengine_prep_slave_sg.patch
mmc-sdhci-omap-fix-tuning-procedure-for-temperatures-20c.patch
s390-kaslr-add-support-for-r_390_glob_dat-relocation-type.patch
s390-zcrypt-fix-memleak-at-release.patch
xtensa-drop-export_symbol-for-outs-ins.patch
xtensa-fix-change_bit-in-exclusive-access-option.patch
zram-fix-race-between-backing_dev_show-and-backing_dev_store.patch

20 files changed:
queue-5.3/hugetlbfs-don-t-access-uninitialized-memmaps-in-pfn_range_valid_gigantic.patch [new file with mode: 0644]
queue-5.3/mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch [new file with mode: 0644]
queue-5.3/mm-memblock-do-not-enforce-current-limit-for-memblock_phys-family.patch [new file with mode: 0644]
queue-5.3/mm-memcg-get-number-of-pages-on-the-lru-list-in-memcgroup-base-on-lru_zone_size.patch [new file with mode: 0644]
queue-5.3/mm-memcg-slab-fix-panic-in-__free_slab-caused-by-premature-memcg-pointer-release.patch [new file with mode: 0644]
queue-5.3/mm-memory-failure-poison-read-receives-sigkill-instead-of-sigbus-if-mmaped-more-than-once.patch [new file with mode: 0644]
queue-5.3/mm-memory-failure.c-don-t-access-uninitialized-memmaps-in-memory_failure.patch [new file with mode: 0644]
queue-5.3/mm-memory_hotplug-don-t-access-uninitialized-memmaps-in-shrink_pgdat_span.patch [new file with mode: 0644]
queue-5.3/mm-memunmap-don-t-access-uninitialized-memmap-in-memunmap_pages.patch [new file with mode: 0644]
queue-5.3/mm-page_owner-don-t-access-uninitialized-memmaps-when-reading-proc-pagetypeinfo.patch [new file with mode: 0644]
queue-5.3/mm-slub-fix-a-deadlock-in-show_slab_objects.patch [new file with mode: 0644]
queue-5.3/mmc-cqhci-commit-descriptors-before-setting-the-doorbell.patch [new file with mode: 0644]
queue-5.3/mmc-mxs-fix-flags-passed-to-dmaengine_prep_slave_sg.patch [new file with mode: 0644]
queue-5.3/mmc-sdhci-omap-fix-tuning-procedure-for-temperatures-20c.patch [new file with mode: 0644]
queue-5.3/s390-kaslr-add-support-for-r_390_glob_dat-relocation-type.patch [new file with mode: 0644]
queue-5.3/s390-zcrypt-fix-memleak-at-release.patch [new file with mode: 0644]
queue-5.3/series
queue-5.3/xtensa-drop-export_symbol-for-outs-ins.patch [new file with mode: 0644]
queue-5.3/xtensa-fix-change_bit-in-exclusive-access-option.patch [new file with mode: 0644]
queue-5.3/zram-fix-race-between-backing_dev_show-and-backing_dev_store.patch [new file with mode: 0644]

diff --git a/queue-5.3/hugetlbfs-don-t-access-uninitialized-memmaps-in-pfn_range_valid_gigantic.patch b/queue-5.3/hugetlbfs-don-t-access-uninitialized-memmaps-in-pfn_range_valid_gigantic.patch
new file mode 100644 (file)
index 0000000..a73dfdd
--- /dev/null
@@ -0,0 +1,59 @@
+From f231fe4235e22e18d847e05cbe705deaca56580a Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Fri, 18 Oct 2019 20:20:05 -0700
+Subject: hugetlbfs: don't access uninitialized memmaps in pfn_range_valid_gigantic()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit f231fe4235e22e18d847e05cbe705deaca56580a upstream.
+
+Uninitialized memmaps contain garbage and in the worst case trigger
+kernel BUGs, especially with CONFIG_PAGE_POISONING.  They should not get
+touched.
+
+Let's make sure that we only consider online memory (managed by the
+buddy) that has initialized memmaps.  ZONE_DEVICE is not applicable.
+
+page_zone() will call page_to_nid(), which will trigger
+VM_BUG_ON_PGFLAGS(PagePoisoned(page), page) with CONFIG_PAGE_POISONING
+and CONFIG_DEBUG_VM_PGFLAGS when called on uninitialized memmaps.  This
+can be the case when an offline memory block (e.g., never onlined) is
+spanned by a zone.
+
+Note: As explained by Michal in [1], alloc_contig_range() will verify
+the range.  So it boils down to the wrong access in this function.
+
+[1] http://lkml.kernel.org/r/20180423000943.GO17484@dhcp22.suse.cz
+
+Link: http://lkml.kernel.org/r/20191015120717.4858-1-david@redhat.com
+Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online")     [visible after d0dc12e86b319]
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reported-by: Michal Hocko <mhocko@kernel.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: <stable@vger.kernel.org>   [4.13+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -1084,11 +1084,10 @@ static bool pfn_range_valid_gigantic(str
+       struct page *page;
+       for (i = start_pfn; i < end_pfn; i++) {
+-              if (!pfn_valid(i))
++              page = pfn_to_online_page(i);
++              if (!page)
+                       return false;
+-              page = pfn_to_page(i);
+-
+               if (page_zone(page) != z)
+                       return false;
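
A minimal userspace sketch of the pattern this fix adopts -- translate each
pfn with pfn_to_online_page() and treat NULL as "do not touch" -- with toy
stand-ins for the kernel helpers (the array-backed memmap below is
illustrative, not kernel API):

  #include <stdbool.h>
  #include <stddef.h>
  #include <stdio.h>

  #define NR_PFNS 8

  /* Toy memmap: memmap[i].zone is only meaningful when online[i] is set. */
  struct page { int zone; };
  static struct page memmap[NR_PFNS];
  static bool online[NR_PFNS] = { true, true, true, false, true };

  /* Stand-in for pfn_to_online_page(): NULL unless the memmap is valid. */
  static struct page *pfn_to_online_page(unsigned long pfn)
  {
      if (pfn >= NR_PFNS || !online[pfn])
          return NULL;
      return &memmap[pfn];
  }

  /* Shape of the fixed pfn_range_valid_gigantic(): never dereference a
   * page whose memmap might contain garbage. */
  static bool pfn_range_valid(unsigned long start, unsigned long end, int zone)
  {
      for (unsigned long pfn = start; pfn < end; pfn++) {
          struct page *page = pfn_to_online_page(pfn);

          if (!page || page->zone != zone)
              return false;
      }
      return true;
  }

  int main(void)
  {
      printf("[0,3) valid: %d\n", pfn_range_valid(0, 3, 0)); /* 1 */
      printf("[0,5) valid: %d\n", pfn_range_valid(0, 5, 0)); /* 0: pfn 3 offline */
      return 0;
  }
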
diff --git a/queue-5.3/mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch b/queue-5.3/mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch
new file mode 100644 (file)
index 0000000..e945eda
--- /dev/null
@@ -0,0 +1,70 @@
+From a2e9a5afce080226edbf1882d63d99bf32070e9e Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 14 Oct 2019 14:12:07 -0700
+Subject: mm, compaction: fix wrong pfn handling in __reset_isolation_pfn()
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit a2e9a5afce080226edbf1882d63d99bf32070e9e upstream.
+
+Florian and Dave reported [1] a NULL pointer dereference in
+__reset_isolation_pfn().  While the exact cause is unclear, staring at
+the code revealed two bugs, which might be related.
+
+One bug is that if the zone starts in the middle of a pageblock, block_page
+might correspond to a different pfn than block_pfn, and then the
+pfn_valid_within() checks will check different pfns than those accessed
+via struct page.  This might result in accessing an uninitialized page in
+CONFIG_HOLES_IN_ZONE configs.
+
+The other bug is that end_page refers to the first page of next
+pageblock and not last page of current pageblock.  The online and valid
+check is then wrong and with sections, the while (page < end_page) loop
+might wander off actual struct page arrays.
+
+[1] https://lore.kernel.org/linux-xfs/87o8z1fvqu.fsf@mid.deneb.enyo.de/
+
+Link: http://lkml.kernel.org/r/20191008152915.24704-1-vbabka@suse.cz
+Fixes: 6b0868c820ff ("mm/compaction.c: correct zone boundary handling when resetting pageblock skip hints")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Reported-by: Florian Weimer <fw@deneb.enyo.de>
+Reported-by: Dave Chinner <david@fromorbit.com>
+Acked-by: Mel Gorman <mgorman@techsingularity.net>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -270,14 +270,15 @@ __reset_isolation_pfn(struct zone *zone,
+       /* Ensure the start of the pageblock or zone is online and valid */
+       block_pfn = pageblock_start_pfn(pfn);
+-      block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn));
++      block_pfn = max(block_pfn, zone->zone_start_pfn);
++      block_page = pfn_to_online_page(block_pfn);
+       if (block_page) {
+               page = block_page;
+               pfn = block_pfn;
+       }
+       /* Ensure the end of the pageblock or zone is online and valid */
+-      block_pfn += pageblock_nr_pages;
++      block_pfn = pageblock_end_pfn(pfn) - 1;
+       block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
+       end_page = pfn_to_online_page(block_pfn);
+       if (!end_page)
+@@ -303,7 +304,7 @@ __reset_isolation_pfn(struct zone *zone,
+               page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+               pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+-      } while (page < end_page);
++      } while (page <= end_page);
+       return false;
+ }
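
Both fixes are pure pfn arithmetic. A hedged sketch of just that boundary
math, with a toy pageblock order (the kernel's value is
configuration-dependent) and no struct pages involved:

  #include <stdio.h>

  #define PAGEBLOCK_ORDER 4UL /* toy value */
  #define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

  static unsigned long pageblock_start_pfn(unsigned long pfn)
  {
      return pfn & ~(PAGEBLOCK_NR_PAGES - 1);
  }

  static unsigned long pageblock_end_pfn(unsigned long pfn)
  {
      return pageblock_start_pfn(pfn) + PAGEBLOCK_NR_PAGES;
  }

  int main(void)
  {
      unsigned long zone_start = 5, pfn = 7;

      /* Bug 1: translating pageblock_start_pfn(pfn) to a page before
       * clamping to the zone start lets pfn and page diverge. Clamping
       * the pfn first keeps the two in sync. */
      unsigned long block_pfn = pageblock_start_pfn(pfn);               /* 0 */
      unsigned long clamped = block_pfn > zone_start ? block_pfn : zone_start;
      printf("unclamped pfn %lu vs clamped pfn %lu\n", block_pfn, clamped);

      /* Bug 2: block_pfn + PAGEBLOCK_NR_PAGES is the first pfn of the
       * NEXT pageblock; the last pfn of the current block is
       * pageblock_end_pfn(pfn) - 1, scanned with '<='. */
      printf("old end %lu, fixed end %lu\n",
             block_pfn + PAGEBLOCK_NR_PAGES, pageblock_end_pfn(pfn) - 1);
      return 0;
  }
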
diff --git a/queue-5.3/mm-memblock-do-not-enforce-current-limit-for-memblock_phys-family.patch b/queue-5.3/mm-memblock-do-not-enforce-current-limit-for-memblock_phys-family.patch
new file mode 100644 (file)
index 0000000..1340fbe
--- /dev/null
@@ -0,0 +1,72 @@
+From f3057ad767542be7bbac44e548cb44017178a163 Mon Sep 17 00:00:00 2001
+From: Mike Rapoport <rppt@linux.ibm.com>
+Date: Fri, 18 Oct 2019 20:20:01 -0700
+Subject: mm: memblock: do not enforce current limit for memblock_phys* family
+
+From: Mike Rapoport <rppt@linux.ibm.com>
+
+commit f3057ad767542be7bbac44e548cb44017178a163 upstream.
+
+Until commit 92d12f9544b7 ("memblock: refactor internal allocation
+functions") the maximal address for memblock allocations was forced to
+memblock.current_limit only for the allocation functions returning
+a virtual address.  The changes introduced by that commit moved the limit
+enforcement into the allocation core and, as a result, the allocation
+functions returning a physical address also started to limit allocations
+to memblock.current_limit.
+
+This caused breakage of etnaviv GPU driver:
+
+  etnaviv etnaviv: bound 130000.gpu (ops gpu_ops)
+  etnaviv etnaviv: bound 134000.gpu (ops gpu_ops)
+  etnaviv etnaviv: bound 2204000.gpu (ops gpu_ops)
+  etnaviv-gpu 130000.gpu: model: GC2000, revision: 5108
+  etnaviv-gpu 130000.gpu: command buffer outside valid memory window
+  etnaviv-gpu 134000.gpu: model: GC320, revision: 5007
+  etnaviv-gpu 134000.gpu: command buffer outside valid memory window
+  etnaviv-gpu 2204000.gpu: model: GC355, revision: 1215
+  etnaviv-gpu 2204000.gpu: Ignoring GPU with VG and FE2.0
+
+Restore the behaviour of memblock_phys* family so that these functions
+will not enforce memblock.current_limit.
+
+Link: http://lkml.kernel.org/r/1570915861-17633-1-git-send-email-rppt@kernel.org
+Fixes: 92d12f9544b7 ("memblock: refactor internal allocation functions")
+Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
+Reported-by: Adam Ford <aford173@gmail.com>
+Tested-by: Adam Ford <aford173@gmail.com>      [imx6q-logicpd]
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Fabio Estevam <festevam@gmail.com>
+Cc: Lucas Stach <l.stach@pengutronix.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memblock.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/mm/memblock.c
++++ b/mm/memblock.c
+@@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc
+               align = SMP_CACHE_BYTES;
+       }
+-      if (end > memblock.current_limit)
+-              end = memblock.current_limit;
+-
+ again:
+       found = memblock_find_in_range_node(size, align, start, end, nid,
+                                           flags);
+@@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_inte
+       if (WARN_ON_ONCE(slab_is_available()))
+               return kzalloc_node(size, GFP_NOWAIT, nid);
++      if (max_addr > memblock.current_limit)
++              max_addr = memblock.current_limit;
++
+       alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
+       /* retry allocation without lower limit */
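
The shape of the fix: the clamp to memblock.current_limit moves out of the
shared core and into the wrapper that returns virtual addresses, so the
physical-address family passes its caller's limits through untouched. A toy
sketch of that split (function names echo the patch, the bodies are
stand-ins):

  #include <stdio.h>

  typedef unsigned long phys_addr_t;

  static phys_addr_t current_limit = 0x1000000; /* toy 16 MiB limit */

  /* Shared core: honors exactly the range the caller passes in. */
  static phys_addr_t alloc_range(phys_addr_t size, phys_addr_t end)
  {
      return end - size; /* toy: highest fitting address */
  }

  /* memblock_phys_alloc*-style: no clamping, may go above the limit. */
  static phys_addr_t alloc_phys(phys_addr_t size, phys_addr_t end)
  {
      return alloc_range(size, end);
  }

  /* memblock_alloc*-style: clamp before calling the core. */
  static phys_addr_t alloc_virt(phys_addr_t size, phys_addr_t end)
  {
      if (end > current_limit)
          end = current_limit;
      return alloc_range(size, end);
  }

  int main(void)
  {
      phys_addr_t top = 0x80000000; /* toy 2 GiB of RAM */

      printf("phys: %#lx (above the limit is allowed)\n",
             alloc_phys(0x1000, top));
      printf("virt: %#lx (clamped to current_limit)\n",
             alloc_virt(0x1000, top));
      return 0;
  }
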
diff --git a/queue-5.3/mm-memcg-get-number-of-pages-on-the-lru-list-in-memcgroup-base-on-lru_zone_size.patch b/queue-5.3/mm-memcg-get-number-of-pages-on-the-lru-list-in-memcgroup-base-on-lru_zone_size.patch
new file mode 100644 (file)
index 0000000..df104cd
--- /dev/null
@@ -0,0 +1,151 @@
+From b11edebbc967ebf5c55b8f9e1d5bb6d68ec3a7fd Mon Sep 17 00:00:00 2001
+From: Honglei Wang <honglei.wang@oracle.com>
+Date: Fri, 18 Oct 2019 20:19:58 -0700
+Subject: mm: memcg: get number of pages on the LRU list in memcgroup base on lru_zone_size
+
+From: Honglei Wang <honglei.wang@oracle.com>
+
+commit b11edebbc967ebf5c55b8f9e1d5bb6d68ec3a7fd upstream.
+
+Commit 1a61ab8038e72 ("mm: memcontrol: replace zone summing with
+lruvec_page_state()") made lruvec_page_state() use per-cpu counters
+instead of calculating the size directly from lru_zone_size, with the
+idea that this would be more efficient.
+
+Tim has reported that this is not really the case for their database
+benchmark, which shows the opposite result: lruvec_page_state is taking
+up a huge chunk of CPU cycles (about 25% of the system time, which is
+roughly 7% of total cpu cycles) on 5.3 kernels.  The workload runs on a
+large machine (96 cpus), has many cgroups (500), and is heavily direct
+reclaim bound.
+
+Tim Chen said:
+
+: The problem can also be reproduced by running simple multi-threaded
+: pmbench benchmark with a fast Optane SSD swap (see profile below).
+:
+:
+: 6.15%     3.08%  pmbench          [kernel.vmlinux]            [k] lruvec_lru_size
+:             |
+:             |--3.07%--lruvec_lru_size
+:             |          |
+:             |          |--2.11%--cpumask_next
+:             |          |          |
+:             |          |           --1.66%--find_next_bit
+:             |          |
+:             |           --0.57%--call_function_interrupt
+:             |                     |
+:             |                      --0.55%--smp_call_function_interrupt
+:             |
+:             |--1.59%--0x441f0fc3d009
+:             |          _ops_rdtsc_init_base_freq
+:             |          access_histogram
+:             |          page_fault
+:             |          __do_page_fault
+:             |          handle_mm_fault
+:             |          __handle_mm_fault
+:             |          |
+:             |           --1.54%--do_swap_page
+:             |                     swapin_readahead
+:             |                     swap_cluster_readahead
+:             |                     |
+:             |                      --1.53%--read_swap_cache_async
+:             |                                __read_swap_cache_async
+:             |                                alloc_pages_vma
+:             |                                __alloc_pages_nodemask
+:             |                                __alloc_pages_slowpath
+:             |                                try_to_free_pages
+:             |                                do_try_to_free_pages
+:             |                                shrink_node
+:             |                                shrink_node_memcg
+:             |                                |
+:             |                                |--0.77%--lruvec_lru_size
+:             |                                |
+:             |                                 --0.76%--inactive_list_is_low
+:             |                                           |
+:             |                                            --0.76%--lruvec_lru_size
+:             |
+:              --1.50%--measure_read
+:                        page_fault
+:                        __do_page_fault
+:                        handle_mm_fault
+:                        __handle_mm_fault
+:                        do_swap_page
+:                        swapin_readahead
+:                        swap_cluster_readahead
+:                        |
+:                         --1.48%--read_swap_cache_async
+:                                   __read_swap_cache_async
+:                                   alloc_pages_vma
+:                                   __alloc_pages_nodemask
+:                                   __alloc_pages_slowpath
+:                                   try_to_free_pages
+:                                   do_try_to_free_pages
+:                                   shrink_node
+:                                   shrink_node_memcg
+:                                   |
+:                                   |--0.75%--inactive_list_is_low
+:                                   |          |
+:                                   |           --0.75%--lruvec_lru_size
+:                                   |
+:                                    --0.73%--lruvec_lru_size
+
+The likely culprit is the cache traffic that lruvec_page_state_local
+generates.  Dave Hansen says:
+
+: I was thinking purely of the cache footprint.  If it's reading
+: pn->lruvec_stat_local->count[idx] is three separate cachelines, so 192
+: bytes of cache *96 CPUs = 18k of data, mostly read-only.  1 cgroup would
+: be 18k of data for the whole system and the caching would be pretty
+: efficient and all 18k would probably survive a tight page fault loop in
+: the L1.  500 cgroups would be ~90k of data per CPU thread which doesn't
+: fit in the L1 and probably wouldn't survive a tight page fault loop if
+: both logical threads were banging on different cgroups.
+:
+: It's just a theory, but it's why I noted the number of cgroups when I
+: initially saw this show up in profiles
+
+Fix the regression by partially reverting the said commit and calculate
+the lru size explicitly.
+
+Link: http://lkml.kernel.org/r/20190905071034.16822-1-honglei.wang@oracle.com
+Fixes: 1a61ab8038e72 ("mm: memcontrol: replace zone summing with lruvec_page_state()")
+Signed-off-by: Honglei Wang <honglei.wang@oracle.com>
+Reported-by: Tim Chen <tim.c.chen@linux.intel.com>
+Acked-by: Tim Chen <tim.c.chen@linux.intel.com>
+Tested-by: Tim Chen <tim.c.chen@linux.intel.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Roman Gushchin <guro@fb.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Dave Hansen <dave.hansen@intel.com>
+Cc: <stable@vger.kernel.org>   [5.2+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -354,12 +354,13 @@ unsigned long zone_reclaimable_pages(str
+  */
+ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+ {
+-      unsigned long lru_size;
++      unsigned long lru_size = 0;
+       int zid;
+-      if (!mem_cgroup_disabled())
+-              lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+-      else
++      if (!mem_cgroup_disabled()) {
++              for (zid = 0; zid < MAX_NR_ZONES; zid++)
++                      lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
++      } else
+               lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
+       for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
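
The restored calculation is a short, cache-friendly walk over per-zone
counters instead of an aggregation of per-cpu state. A minimal sketch of
the net effect of that summation (toy types; the per-zone lookup stands in
for mem_cgroup_get_zone_lru_size()):

  #include <stdio.h>

  #define MAX_NR_ZONES 4

  /* Toy lruvec: per-zone page counts for one LRU list. */
  struct lruvec { unsigned long zone_lru_size[MAX_NR_ZONES]; };

  static unsigned long zone_lru_size(struct lruvec *l, int zid)
  {
      return l->zone_lru_size[zid];
  }

  /* Net effect of the fixed lruvec_lru_size() for the memcg case: sum
   * the per-zone sizes up to and including zone_idx. */
  static unsigned long lruvec_lru_size(struct lruvec *l, int zone_idx)
  {
      unsigned long lru_size = 0;

      for (int zid = 0; zid <= zone_idx; zid++)
          lru_size += zone_lru_size(l, zid);
      return lru_size;
  }

  int main(void)
  {
      struct lruvec l = { { 100, 200, 300, 50 } };

      printf("LRU size up to zone 1: %lu\n", lruvec_lru_size(&l, 1)); /* 300 */
      return 0;
  }
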
diff --git a/queue-5.3/mm-memcg-slab-fix-panic-in-__free_slab-caused-by-premature-memcg-pointer-release.patch b/queue-5.3/mm-memcg-slab-fix-panic-in-__free_slab-caused-by-premature-memcg-pointer-release.patch
new file mode 100644 (file)
index 0000000..f788f83
--- /dev/null
@@ -0,0 +1,129 @@
+From b749ecfaf6c53ce79d6ab66afd2fc34189a073b1 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin <guro@fb.com>
+Date: Fri, 18 Oct 2019 20:19:44 -0700
+Subject: mm: memcg/slab: fix panic in __free_slab() caused by premature memcg pointer release
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Roman Gushchin <guro@fb.com>
+
+commit b749ecfaf6c53ce79d6ab66afd2fc34189a073b1 upstream.
+
+Karsten reported the following panic in __free_slab() happening on a s390x
+machine:
+
+  Unable to handle kernel pointer dereference in virtual kernel address space
+  Failing address: 0000000000000000 TEID: 0000000000000483
+  Fault in home space mode while using kernel ASCE.
+  AS:00000000017d4007 R3:000000007fbd0007 S:000000007fbff000 P:000000000000003d
+  Oops: 0004 ilc:3 [#1] PREEMPT SMP
+  Modules linked in: tcp_diag inet_diag xt_tcpudp ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_at nf_nat
+  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-05872-g6133e3e4bada-dirty #14
+  Hardware name: IBM 2964 NC9 702 (z/VM 6.4.0)
+  Krnl PSW : 0704d00180000000 00000000003cadb6 (__free_slab+0x686/0x6b0)
+             R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3
+  Krnl GPRS: 00000000f3a32928 0000000000000000 000000007fbf5d00 000000000117c4b8
+             0000000000000000 000000009e3291c1 0000000000000000 0000000000000000
+             0000000000000003 0000000000000008 000000002b478b00 000003d080a97600
+             0000000000000003 0000000000000008 000000002b478b00 000003d080a97600
+             000000000117ba00 000003e000057db0 00000000003cabcc 000003e000057c78
+  Krnl Code: 00000000003cada6: e310a1400004        lg      %r1,320(%r10)
+             00000000003cadac: c0e50046c286        brasl   %r14,ca32b8
+            #00000000003cadb2: a7f4fe36            brc     15,3caa1e
+            >00000000003cadb6: e32060800024        stg     %r2,128(%r6)
+             00000000003cadbc: a7f4fd9e            brc     15,3ca8f8
+             00000000003cadc0: c0e50046790c        brasl   %r14,c99fd8
+             00000000003cadc6: a7f4fe2c            brc     15,3caa1e
+             00000000003cadca: ecb1ffff00d9        aghik   %r11,%r1,-1
+  Call Trace:
+  (<00000000003cabcc> __free_slab+0x49c/0x6b0)
+   <00000000001f5886> rcu_core+0x5a6/0x7e0
+   <0000000000ca2dea> __do_softirq+0xf2/0x5c0
+   <0000000000152644> irq_exit+0x104/0x130
+   <000000000010d222> do_IRQ+0x9a/0xf0
+   <0000000000ca2344> ext_int_handler+0x130/0x134
+   <0000000000103648> enabled_wait+0x58/0x128
+  (<0000000000103634> enabled_wait+0x44/0x128)
+   <0000000000103b00> arch_cpu_idle+0x40/0x58
+   <0000000000ca0544> default_idle_call+0x3c/0x68
+   <000000000018eaa4> do_idle+0xec/0x1c0
+   <000000000018ee0e> cpu_startup_entry+0x36/0x40
+   <000000000122df34> arch_call_rest_init+0x5c/0x88
+   <0000000000000000> 0x0
+  INFO: lockdep is turned off.
+  Last Breaking-Event-Address:
+   <00000000003ca8f4> __free_slab+0x1c4/0x6b0
+  Kernel panic - not syncing: Fatal exception in interrupt
+
+The kernel panics on an attempt to dereference the NULL memcg pointer.
+When shutdown_cache() is called from the kmem_cache_destroy() context, a
+memcg kmem_cache might have empty slab pages in a partial list, which are
+still charged to the memory cgroup.
+
+These pages are released by free_partial() at the beginning of
+shutdown_cache(): either directly or by scheduling a RCU-delayed work
+(if the kmem_cache has the SLAB_TYPESAFE_BY_RCU flag).  The latter case
+is when the reported panic can happen: memcg_unlink_cache() is called
+immediately after shrinking partial lists, without waiting for scheduled
+RCU works.  It sets the kmem_cache->memcg_params.memcg pointer to NULL,
+and the following attempt to dereference it by __free_slab() from the
+RCU work context causes the panic.
+
+To fix the issue, let's postpone the release of the memcg pointer to
+destroy_memcg_params().  It's called from a separate work context by
+slab_caches_to_rcu_destroy_workfn(), which contains a full RCU barrier.
+This guarantees that all scheduled page release RCU works will complete
+before the memcg pointer will be zeroed.
+
+Big thanks to Karsten for the perfect report containing all necessary
+information, for his help with the analysis of the problem, and for
+testing the fix.
+
+Link: http://lkml.kernel.org/r/20191010160549.1584316-1-guro@fb.com
+Fixes: fb2f2b0adb98 ("mm: memcg/slab: reparent memcg kmem_caches on cgroup removal")
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Reported-by: Karsten Graul <kgraul@linux.ibm.com>
+Tested-by: Karsten Graul <kgraul@linux.ibm.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Cc: Karsten Graul <kgraul@linux.ibm.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slab_common.c |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -178,10 +178,13 @@ static int init_memcg_params(struct kmem
+ static void destroy_memcg_params(struct kmem_cache *s)
+ {
+-      if (is_root_cache(s))
++      if (is_root_cache(s)) {
+               kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+-      else
++      } else {
++              mem_cgroup_put(s->memcg_params.memcg);
++              WRITE_ONCE(s->memcg_params.memcg, NULL);
+               percpu_ref_exit(&s->memcg_params.refcnt);
++      }
+ }
+ static void free_memcg_params(struct rcu_head *rcu)
+@@ -253,8 +256,6 @@ static void memcg_unlink_cache(struct km
+       } else {
+               list_del(&s->memcg_params.children_node);
+               list_del(&s->memcg_params.kmem_caches_node);
+-              mem_cgroup_put(s->memcg_params.memcg);
+-              WRITE_ONCE(s->memcg_params.memcg, NULL);
+       }
+ }
+ #else
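
The fix is purely an ordering change: the memcg reference may only be
dropped after every RCU-deferred __free_slab() has run, so the put moves
from memcg_unlink_cache() to destroy_memcg_params(), which executes after
the rcu_barrier() in the deferred-destroy worker. A toy sketch of that
ordering with no real RCU (the names mirror the patch, the bodies are
stand-ins):

  #include <stdio.h>

  struct memcg { int refcount; };
  struct kmem_cache { struct memcg *memcg; };

  /* Runs later, from an RCU callback: must still see a valid pointer. */
  static void deferred_free_slab(struct kmem_cache *s)
  {
      if (!s->memcg) {
          printf("BUG: NULL memcg dereference\n");
          return;
      }
      printf("slab freed, uncharged from a live memcg\n");
  }

  /* Fixed ordering: unlinking no longer touches the memcg pointer... */
  static void memcg_unlink_cache(struct kmem_cache *s)
  {
      (void)s; /* list_del()s only in the real code */
  }

  /* ...the reference is dropped only after all deferred work is done. */
  static void destroy_memcg_params(struct kmem_cache *s)
  {
      s->memcg->refcount--;
      s->memcg = NULL;
  }

  int main(void)
  {
      struct memcg m = { .refcount = 1 };
      struct kmem_cache s = { .memcg = &m };

      memcg_unlink_cache(&s);
      deferred_free_slab(&s);   /* RCU work: pointer still valid */
      destroy_memcg_params(&s); /* only now is the reference dropped */
      return 0;
  }
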
diff --git a/queue-5.3/mm-memory-failure-poison-read-receives-sigkill-instead-of-sigbus-if-mmaped-more-than-once.patch b/queue-5.3/mm-memory-failure-poison-read-receives-sigkill-instead-of-sigbus-if-mmaped-more-than-once.patch
new file mode 100644 (file)
index 0000000..82f8e94
--- /dev/null
@@ -0,0 +1,114 @@
+From 3d7fed4ad8ccb691d217efbb0f934e6a4df5ef91 Mon Sep 17 00:00:00 2001
+From: Jane Chu <jane.chu@oracle.com>
+Date: Mon, 14 Oct 2019 14:12:29 -0700
+Subject: mm/memory-failure: poison read receives SIGKILL instead of SIGBUS if mmaped more than once
+
+From: Jane Chu <jane.chu@oracle.com>
+
+commit 3d7fed4ad8ccb691d217efbb0f934e6a4df5ef91 upstream.
+
+Mmap /dev/dax more than once, then read the poison location using the
+address from one of the mappings.  The other mappings, which do not have
+the page mapped in, cause SIGKILLs to be delivered to the process.
+SIGKILL succeeds over SIGBUS, so the user process loses the opportunity
+to handle the UE.
+
+Although one may add MAP_POPULATE to mmap(2) to work around the issue,
+MAP_POPULATE makes mapping 128GB of pmem several orders of magnitude
+slower, so it isn't always an option.
+
+Details -
+
+  ndctl inject-error --block=10 --count=1 namespace6.0
+
+  ./read_poison -x dax6.0 -o 5120 -m 2
+  mmaped address 0x7f5bb6600000
+  mmaped address 0x7f3cf3600000
+  doing local read at address 0x7f3cf3601400
+  Killed
+
+Console messages in instrumented kernel -
+
+  mce: Uncorrected hardware memory error in user-access at edbe201400
+  Memory failure: tk->addr = 7f5bb6601000
+  Memory failure: address edbe201: call dev_pagemap_mapping_shift
+  dev_pagemap_mapping_shift: page edbe201: no PUD
+  Memory failure: tk->size_shift == 0
+  Memory failure: Unable to find user space address edbe201 in read_poison
+  Memory failure: tk->addr = 7f3cf3601000
+  Memory failure: address edbe201: call dev_pagemap_mapping_shift
+  Memory failure: tk->size_shift = 21
+  Memory failure: 0xedbe201: forcibly killing read_poison:22434 because of failure to unmap corrupted page
+    => to deliver SIGKILL
+  Memory failure: 0xedbe201: Killing read_poison:22434 due to hardware memory corruption
+    => to deliver SIGBUS
+
+Link: http://lkml.kernel.org/r/1565112345-28754-3-git-send-email-jane.chu@oracle.com
+Signed-off-by: Jane Chu <jane.chu@oracle.com>
+Suggested-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Reviewed-by: Dan Williams <dan.j.williams@intel.com>
+Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c |   22 +++++++++++++---------
+ 1 file changed, 13 insertions(+), 9 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -199,7 +199,6 @@ struct to_kill {
+       struct task_struct *tsk;
+       unsigned long addr;
+       short size_shift;
+-      char addr_valid;
+ };
+ /*
+@@ -324,22 +323,27 @@ static void add_to_kill(struct task_stru
+               }
+       }
+       tk->addr = page_address_in_vma(p, vma);
+-      tk->addr_valid = 1;
+       if (is_zone_device_page(p))
+               tk->size_shift = dev_pagemap_mapping_shift(p, vma);
+       else
+               tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
+       /*
+-       * In theory we don't have to kill when the page was
+-       * munmaped. But it could be also a mremap. Since that's
+-       * likely very rare kill anyways just out of paranoia, but use
+-       * a SIGKILL because the error is not contained anymore.
++       * Send SIGKILL if "tk->addr == -EFAULT". Also, as
++       * "tk->size_shift" is always non-zero for !is_zone_device_page(),
++       * so "tk->size_shift == 0" effectively checks no mapping on
++       * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
++       * to a process' address space, it's possible not all N VMAs
++       * contain mappings for the page, but at least one VMA does.
++       * Only deliver SIGBUS with payload derived from the VMA that
++       * has a mapping for the page.
+        */
+-      if (tk->addr == -EFAULT || tk->size_shift == 0) {
++      if (tk->addr == -EFAULT) {
+               pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+                       page_to_pfn(p), tsk->comm);
+-              tk->addr_valid = 0;
++      } else if (tk->size_shift == 0) {
++              kfree(tk);
++              return;
+       }
+       get_task_struct(tsk);
+       tk->tsk = tsk;
+@@ -366,7 +370,7 @@ static void kill_procs(struct list_head
+                        * make sure the process doesn't catch the
+                        * signal and then access the memory. Just kill it.
+                        */
+-                      if (fail || tk->addr_valid == 0) {
++                      if (fail || tk->addr == -EFAULT) {
+                               pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+                                      pfn, tk->tsk->comm, tk->tsk->pid);
+                               do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
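
The new logic separates two failure modes: "no address at all"
(tk->addr == -EFAULT, keep the entry and later force SIGKILL) and "this
VMA just doesn't map the page" (tk->size_shift == 0 on ZONE_DEVICE, drop
the entry so the VMA that does map the page can deliver SIGBUS). A sketch
of that filtering with toy types:

  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define TOY_EFAULT 14

  struct to_kill { long addr; short size_shift; };

  /* Returns NULL when the entry should be dropped rather than queued. */
  static struct to_kill *classify(long addr, short size_shift)
  {
      struct to_kill *tk = malloc(sizeof(*tk));

      tk->addr = addr;
      tk->size_shift = size_shift;

      if (addr == -TOY_EFAULT)
          return tk; /* keep: delivered as SIGKILL, error not contained */

      if (size_shift == 0) {
          /* devdax VMA without a mapping for this page: drop it so
           * another VMA's SIGBUS (with a usable payload) wins */
          free(tk);
          return NULL;
      }
      return tk; /* normal case: SIGBUS with a valid payload */
  }

  int main(void)
  {
      printf("unmapped devdax VMA dropped: %s\n",
             classify(0x7f0000000000L, 0) ? "no" : "yes");
      printf("mapped VMA queued: %s\n",
             classify(0x7f0000000000L, 21) ? "yes" : "no");
      return 0;
  }
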
diff --git a/queue-5.3/mm-memory-failure.c-don-t-access-uninitialized-memmaps-in-memory_failure.patch b/queue-5.3/mm-memory-failure.c-don-t-access-uninitialized-memmaps-in-memory_failure.patch
new file mode 100644 (file)
index 0000000..1cf58aa
--- /dev/null
@@ -0,0 +1,55 @@
+From 96c804a6ae8c59a9092b3d5dd581198472063184 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Fri, 18 Oct 2019 20:19:23 -0700
+Subject: mm/memory-failure.c: don't access uninitialized memmaps in memory_failure()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 96c804a6ae8c59a9092b3d5dd581198472063184 upstream.
+
+We should check for pfn_to_online_page() to not access uninitialized
+memmaps.  Reshuffle the code so we don't have to duplicate the error
+message.
+
+Link: http://lkml.kernel.org/r/20191009142435.3975-3-david@redhat.com
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online")     [visible after d0dc12e86b319]
+Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: <stable@vger.kernel.org>   [4.13+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1253,17 +1253,19 @@ int memory_failure(unsigned long pfn, in
+       if (!sysctl_memory_failure_recovery)
+               panic("Memory failure on page %lx", pfn);
+-      if (!pfn_valid(pfn)) {
++      p = pfn_to_online_page(pfn);
++      if (!p) {
++              if (pfn_valid(pfn)) {
++                      pgmap = get_dev_pagemap(pfn, NULL);
++                      if (pgmap)
++                              return memory_failure_dev_pagemap(pfn, flags,
++                                                                pgmap);
++              }
+               pr_err("Memory failure: %#lx: memory outside kernel control\n",
+                       pfn);
+               return -ENXIO;
+       }
+-      pgmap = get_dev_pagemap(pfn, NULL);
+-      if (pgmap)
+-              return memory_failure_dev_pagemap(pfn, flags, pgmap);
+-
+-      p = pfn_to_page(pfn);
+       if (PageHuge(p))
+               return memory_failure_hugetlb(pfn, flags);
+       if (TestSetPageHWPoison(p)) {
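
The reshuffled entry check is a fallback chain: try pfn_to_online_page()
first, and only if that fails consider the ZONE_DEVICE path, with one
error message covering everything else. A sketch of that chain with toy
predicates standing in for the kernel helpers:

  #include <stdbool.h>
  #include <stdio.h>

  /* Toy classification of a pfn. */
  static bool is_online(unsigned long pfn) { return pfn < 100; }
  static bool is_valid(unsigned long pfn)  { return pfn < 200; }
  static bool is_device(unsigned long pfn) { return pfn >= 100 && pfn < 200; }

  static int toy_memory_failure(unsigned long pfn)
  {
      if (!is_online(pfn)) {
          /* Not online: may still be ZONE_DEVICE, which is valid but
           * never "online" -- hand it to the pagemap path. */
          if (is_valid(pfn) && is_device(pfn)) {
              printf("pfn %lu: ZONE_DEVICE pagemap path\n", pfn);
              return 0;
          }
          printf("pfn %lu: memory outside kernel control\n", pfn);
          return -1; /* stands in for -ENXIO */
      }
      printf("pfn %lu: normal memory-failure handling\n", pfn);
      return 0;
  }

  int main(void)
  {
      toy_memory_failure(42);  /* online memory */
      toy_memory_failure(150); /* ZONE_DEVICE */
      toy_memory_failure(500); /* neither */
      return 0;
  }
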
diff --git a/queue-5.3/mm-memory_hotplug-don-t-access-uninitialized-memmaps-in-shrink_pgdat_span.patch b/queue-5.3/mm-memory_hotplug-don-t-access-uninitialized-memmaps-in-shrink_pgdat_span.patch
new file mode 100644 (file)
index 0000000..fbaf167
--- /dev/null
@@ -0,0 +1,179 @@
+From 00d6c019b5bc175cee3770e0e659f2b5f4804ea5 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Fri, 18 Oct 2019 20:19:33 -0700
+Subject: mm/memory_hotplug: don't access uninitialized memmaps in shrink_pgdat_span()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 00d6c019b5bc175cee3770e0e659f2b5f4804ea5 upstream.
+
+We might use the nid of memmaps that were never initialized.  For
+example, if the memmap was poisoned, we will crash the kernel in
+pfn_to_nid() right now.  Let's use the calculated boundaries of the
+separate zones instead.  This now also avoids having to iterate over a
+whole bunch of subsections again, after shrinking one zone.
+
+Before commit d0dc12e86b31 ("mm/memory_hotplug: optimize memory
+hotplug"), the memmap was initialized to 0 and the node was set to the
+right value.  After that commit, the node might be garbage.
+
+We'll have to fix shrink_zone_span() next.
+
+Link: http://lkml.kernel.org/r/20191006085646.5768-4-david@redhat.com
+Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online")     [d0dc12e86b319]
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reported-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Wei Yang <richardw.yang@linux.intel.com>
+Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christian Borntraeger <borntraeger@de.ibm.com>
+Cc: Christophe Leroy <christophe.leroy@c-s.fr>
+Cc: Damian Tometzki <damian.tometzki@gmail.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Halil Pasic <pasic@linux.ibm.com>
+Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Ira Weiny <ira.weiny@intel.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Jun Yao <yaojun8558363@gmail.com>
+Cc: Logan Gunthorpe <logang@deltatee.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
+Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Cc: Mike Rapoport <rppt@linux.ibm.com>
+Cc: Pankaj Gupta <pagupta@redhat.com>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qian Cai <cai@lca.pw>
+Cc: Rich Felker <dalias@libc.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Steve Capper <steve.capper@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Tom Lendacky <thomas.lendacky@amd.com>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Wei Yang <richard.weiyang@gmail.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: <stable@vger.kernel.org>   [4.13+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory_hotplug.c |   74 +++++++++++-----------------------------------------
+ 1 file changed, 16 insertions(+), 58 deletions(-)
+
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -436,67 +436,25 @@ static void shrink_zone_span(struct zone
+       zone_span_writeunlock(zone);
+ }
+-static void shrink_pgdat_span(struct pglist_data *pgdat,
+-                            unsigned long start_pfn, unsigned long end_pfn)
++static void update_pgdat_span(struct pglist_data *pgdat)
+ {
+-      unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+-      unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
+-      unsigned long pgdat_end_pfn = p;
+-      unsigned long pfn;
+-      int nid = pgdat->node_id;
+-
+-      if (pgdat_start_pfn == start_pfn) {
+-              /*
+-               * If the section is smallest section in the pgdat, it need
+-               * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
+-               * In this case, we find second smallest valid mem_section
+-               * for shrinking zone.
+-               */
+-              pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
+-                                              pgdat_end_pfn);
+-              if (pfn) {
+-                      pgdat->node_start_pfn = pfn;
+-                      pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+-              }
+-      } else if (pgdat_end_pfn == end_pfn) {
+-              /*
+-               * If the section is biggest section in the pgdat, it need
+-               * shrink pgdat->node_spanned_pages.
+-               * In this case, we find second biggest valid mem_section for
+-               * shrinking zone.
+-               */
+-              pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
+-                                             start_pfn);
+-              if (pfn)
+-                      pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+-      }
+-
+-      /*
+-       * If the section is not biggest or smallest mem_section in the pgdat,
+-       * it only creates a hole in the pgdat. So in this case, we need not
+-       * change the pgdat.
+-       * But perhaps, the pgdat has only hole data. Thus it check the pgdat
+-       * has only hole or not.
+-       */
+-      pfn = pgdat_start_pfn;
+-      for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) {
+-              if (unlikely(!pfn_valid(pfn)))
+-                      continue;
+-
+-              if (pfn_to_nid(pfn) != nid)
+-                      continue;
+-
+-              /* Skip range to be removed */
+-              if (pfn >= start_pfn && pfn < end_pfn)
+-                      continue;
++      unsigned long node_start_pfn = 0, node_end_pfn = 0;
++      struct zone *zone;
+-              /* If we find valid section, we have nothing to do */
+-              return;
++      for (zone = pgdat->node_zones;
++           zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
++              unsigned long zone_end_pfn = zone->zone_start_pfn +
++                                           zone->spanned_pages;
++
++              /* No need to lock the zones, they can't change. */
++              if (zone_end_pfn > node_end_pfn)
++                      node_end_pfn = zone_end_pfn;
++              if (zone->zone_start_pfn < node_start_pfn)
++                      node_start_pfn = zone->zone_start_pfn;
+       }
+-      /* The pgdat has no valid section */
+-      pgdat->node_start_pfn = 0;
+-      pgdat->node_spanned_pages = 0;
++      pgdat->node_start_pfn = node_start_pfn;
++      pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
+ }
+ static void __remove_zone(struct zone *zone, unsigned long start_pfn,
+@@ -507,7 +465,7 @@ static void __remove_zone(struct zone *z
+       pgdat_resize_lock(zone->zone_pgdat, &flags);
+       shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
+-      shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
++      update_pgdat_span(pgdat);
+       pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ }
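
update_pgdat_span() derives the node span purely from bookkeeping the
zones already maintain: the node starts at the lowest zone start and ends
at the highest zone end. A compact sketch of that recomputation (toy
types; skipping empty zones here is my addition for clarity, not a claim
about the patch):

  #include <stdio.h>

  #define MAX_NR_ZONES 3

  struct zone { unsigned long start_pfn, spanned_pages; };
  struct pgdat {
      struct zone zones[MAX_NR_ZONES];
      unsigned long node_start_pfn, node_spanned_pages;
  };

  static void update_pgdat_span(struct pgdat *p)
  {
      unsigned long start = 0, end = 0;

      for (int i = 0; i < MAX_NR_ZONES; i++) {
          struct zone *z = &p->zones[i];
          unsigned long zend = z->start_pfn + z->spanned_pages;

          if (!z->spanned_pages) /* empty zone: nothing to contribute */
              continue;
          if (!end || z->start_pfn < start)
              start = z->start_pfn;
          if (zend > end)
              end = zend;
      }
      p->node_start_pfn = start;
      p->node_spanned_pages = end - start;
  }

  int main(void)
  {
      struct pgdat p = { .zones = { { 64, 32 }, { 0, 0 }, { 128, 64 } } };

      update_pgdat_span(&p);
      printf("node: start %lu, spanned %lu\n",
             p.node_start_pfn, p.node_spanned_pages); /* 64, 128 */
      return 0;
  }
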
diff --git a/queue-5.3/mm-memunmap-don-t-access-uninitialized-memmap-in-memunmap_pages.patch b/queue-5.3/mm-memunmap-don-t-access-uninitialized-memmap-in-memunmap_pages.patch
new file mode 100644 (file)
index 0000000..227c077
--- /dev/null
@@ -0,0 +1,188 @@
+From 77e080e7680e1e615587352f70c87b9e98126d03 Mon Sep 17 00:00:00 2001
+From: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
+Date: Fri, 18 Oct 2019 20:19:39 -0700
+Subject: mm/memunmap: don't access uninitialized memmap in memunmap_pages()
+
+From: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+
+commit 77e080e7680e1e615587352f70c87b9e98126d03 upstream.
+
+Patch series "mm/memory_hotplug: Shrink zones before removing memory",
+v6.
+
+This series fixes the access of uninitialized memmaps when shrinking
+zones/nodes and when removing memory.  Also, it contains all fixes for
+crashes that can be triggered when removing certain namespace using
+memunmap_pages() - ZONE_DEVICE, reported by Aneesh.
+
+We stop trying to shrink ZONE_DEVICE, as it's buggy, fixing it would be
+more involved (we don't have SECTION_IS_ONLINE as an indicator), and
+shrinking is only of limited use (set_zone_contiguous() cannot detect
+the ZONE_DEVICE as contiguous).
+
+We continue shrinking !ZONE_DEVICE zones; however, I reduced the amount
+of code to a minimum.  Shrinking is especially necessary to keep
+zone->contiguous set where possible, in particular on memory unplug of
+DIMMs at zone boundaries.
+
+--------------------------------------------------------------------------
+
+Zones are now properly shrunk when offlining memory blocks or when
+onlining failed.  This allows to properly shrink zones on memory unplug
+even if the separate memory blocks of a DIMM were onlined to different
+zones or re-onlined to a different zone after offlining.
+
+Example:
+
+  :/# cat /proc/zoneinfo
+  Node 1, zone  Movable
+          spanned  0
+          present  0
+          managed  0
+  :/# echo "online_movable" > /sys/devices/system/memory/memory41/state
+  :/# echo "online_movable" > /sys/devices/system/memory/memory43/state
+  :/# cat /proc/zoneinfo
+  Node 1, zone  Movable
+          spanned  98304
+          present  65536
+          managed  65536
+  :/# echo 0 > /sys/devices/system/memory/memory43/online
+  :/# cat /proc/zoneinfo
+  Node 1, zone  Movable
+          spanned  32768
+          present  32768
+          managed  32768
+  :/# echo 0 > /sys/devices/system/memory/memory41/online
+  :/# cat /proc/zoneinfo
+  Node 1, zone  Movable
+          spanned  0
+          present  0
+          managed  0
+
+This patch (of 10):
+
+With an altmap, the memmap pages falling into the reserved altmap space
+are not initialized and, therefore, contain a garbage NID and a garbage
+zone.  Make sure to read the NID/zone from a memmap that was initialized.
+
+This fixes a kernel crash that is observed when destroying a namespace:
+
+  kernel BUG at include/linux/mm.h:1107!
+  cpu 0x1: Vector: 700 (Program Check) at [c000000274087890]
+      pc: c0000000004b9728: memunmap_pages+0x238/0x340
+      lr: c0000000004b9724: memunmap_pages+0x234/0x340
+  ...
+      pid   = 3669, comm = ndctl
+  kernel BUG at include/linux/mm.h:1107!
+    devm_action_release+0x30/0x50
+    release_nodes+0x268/0x2d0
+    device_release_driver_internal+0x174/0x240
+    unbind_store+0x13c/0x190
+    drv_attr_store+0x44/0x60
+    sysfs_kf_write+0x70/0xa0
+    kernfs_fop_write+0x1ac/0x290
+    __vfs_write+0x3c/0x70
+    vfs_write+0xe4/0x200
+    ksys_write+0x7c/0x140
+    system_call+0x5c/0x68
+
+The "page_zone(pfn_to_page(pfn)" was introduced by 69324b8f4833 ("mm,
+devm_memremap_pages: add MEMORY_DEVICE_PRIVATE support"), however, I
+think we will never have driver reserved memory with
+MEMORY_DEVICE_PRIVATE (no altmap AFAIKS).
+
+[david@redhat.com: minimze code changes, rephrase description]
+Link: http://lkml.kernel.org/r/20191006085646.5768-2-david@redhat.com
+Fixes: 2c2a5af6fed2 ("mm, memory_hotplug: add nid parameter to arch_remove_memory")
+Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Jason Gunthorpe <jgg@ziepe.ca>
+Cc: Logan Gunthorpe <logang@deltatee.com>
+Cc: Ira Weiny <ira.weiny@intel.com>
+Cc: Damian Tometzki <damian.tometzki@gmail.com>
+Cc: Alexander Duyck <alexander.h.duyck@linux.intel.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Christian Borntraeger <borntraeger@de.ibm.com>
+Cc: Christophe Leroy <christophe.leroy@c-s.fr>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: Halil Pasic <pasic@linux.ibm.com>
+Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
+Cc: "H. Peter Anvin" <hpa@zytor.com>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Jun Yao <yaojun8558363@gmail.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
+Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Mike Rapoport <rppt@linux.ibm.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Pankaj Gupta <pagupta@redhat.com>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
+Cc: Pavel Tatashin <pavel.tatashin@microsoft.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Qian Cai <cai@lca.pw>
+Cc: Rich Felker <dalias@libc.org>
+Cc: Robin Murphy <robin.murphy@arm.com>
+Cc: Steve Capper <steve.capper@arm.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Tom Lendacky <thomas.lendacky@amd.com>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Wei Yang <richard.weiyang@gmail.com>
+Cc: Wei Yang <richardw.yang@linux.intel.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
+Cc: Yu Zhao <yuzhao@google.com>
+Cc: <stable@vger.kernel.org>   [5.0+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memremap.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/mm/memremap.c
++++ b/mm/memremap.c
+@@ -104,6 +104,7 @@ static void devm_memremap_pages_release(
+       struct dev_pagemap *pgmap = data;
+       struct device *dev = pgmap->dev;
+       struct resource *res = &pgmap->res;
++      struct page *first_page;
+       unsigned long pfn;
+       int nid;
+@@ -112,14 +113,16 @@ static void devm_memremap_pages_release(
+               put_page(pfn_to_page(pfn));
+       dev_pagemap_cleanup(pgmap);
++      /* make sure to access a memmap that was actually initialized */
++      first_page = pfn_to_page(pfn_first(pgmap));
++
+       /* pages are dead and unused, undo the arch mapping */
+-      nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start)));
++      nid = page_to_nid(first_page);
+       mem_hotplug_begin();
+       if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+-              pfn = PHYS_PFN(res->start);
+-              __remove_pages(page_zone(pfn_to_page(pfn)), pfn,
+-                               PHYS_PFN(resource_size(res)), NULL);
++              __remove_pages(page_zone(first_page), PHYS_PFN(res->start),
++                             PHYS_PFN(resource_size(res)), NULL);
+       } else {
+               arch_remove_memory(nid, res->start, resource_size(res),
+                               pgmap_altmap(pgmap));
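
The one-line lesson of this fix: with an altmap, the first pfns of the
range carry reserved, never-initialized memmaps, so metadata must be read
from pfn_first() onward. A tiny sketch of why the lookup point matters
(toy layout, nothing here is kernel API):

  #include <stdio.h>

  /* Toy layout: the first 'reserved' pfns of a pagemap carry garbage
   * metadata; pfn_first() skips past them. */
  struct pagemap { unsigned long start_pfn, reserved; };

  static unsigned long pfn_first(struct pagemap *pg)
  {
      return pg->start_pfn + pg->reserved;
  }

  static int page_to_nid(struct pagemap *pg, unsigned long pfn)
  {
      return pfn < pfn_first(pg) ? -1 /* garbage */ : 1;
  }

  int main(void)
  {
      struct pagemap pg = { .start_pfn = 1000, .reserved = 16 };

      printf("nid at res->start:  %d (uninitialized memmap)\n",
             page_to_nid(&pg, pg.start_pfn));
      printf("nid at pfn_first(): %d (initialized memmap)\n",
             page_to_nid(&pg, pfn_first(&pg)));
      return 0;
  }
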
diff --git a/queue-5.3/mm-page_owner-don-t-access-uninitialized-memmaps-when-reading-proc-pagetypeinfo.patch b/queue-5.3/mm-page_owner-don-t-access-uninitialized-memmaps-when-reading-proc-pagetypeinfo.patch
new file mode 100644 (file)
index 0000000..e248fea
--- /dev/null
@@ -0,0 +1,83 @@
+From a26ee565b6cd8dc2bf15ff6aa70bbb28f928b773 Mon Sep 17 00:00:00 2001
+From: Qian Cai <cai@lca.pw>
+Date: Fri, 18 Oct 2019 20:19:29 -0700
+Subject: mm/page_owner: don't access uninitialized memmaps when reading /proc/pagetypeinfo
+
+From: Qian Cai <cai@lca.pw>
+
+commit a26ee565b6cd8dc2bf15ff6aa70bbb28f928b773 upstream.
+
+Uninitialized memmaps contain garbage and in the worst case trigger
+kernel BUGs, especially with CONFIG_PAGE_POISONING.  They should not get
+touched.
+
+For example, when not onlining a memory block that is spanned by a zone
+and reading /proc/pagetypeinfo with CONFIG_DEBUG_VM_PGFLAGS and
+CONFIG_PAGE_POISONING, we can trigger a kernel BUG:
+
+  :/# echo 1 > /sys/devices/system/memory/memory40/online
+  :/# echo 1 > /sys/devices/system/memory/memory42/online
+  :/# cat /proc/pagetypeinfo > test.file
+   page:fffff2c585200000 is uninitialized and poisoned
+   raw: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff
+   raw: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff
+   page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
+   There is not page extension available.
+   ------------[ cut here ]------------
+   kernel BUG at include/linux/mm.h:1107!
+   invalid opcode: 0000 [#1] SMP NOPTI
+
+Please note that this change does not affect ZONE_DEVICE, because
+pagetypeinfo_showmixedcount_print() is called from
+mm/vmstat.c:pagetypeinfo_showmixedcount() only for populated zones, and
+ZONE_DEVICE is never populated (zone->present_pages always 0).
+
+[david@redhat.com: move check to outer loop, add comment, rephrase description]
+Link: http://lkml.kernel.org/r/20191011140638.8160-1-david@redhat.com
+Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") # visible after d0dc12e86b319
+Signed-off-by: Qian Cai <cai@lca.pw>
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
+Cc: Miles Chen <miles.chen@mediatek.com>
+Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Qian Cai <cai@lca.pw>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: <stable@vger.kernel.org>   [4.13+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page_owner.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/mm/page_owner.c
++++ b/mm/page_owner.c
+@@ -258,7 +258,8 @@ void pagetypeinfo_showmixedcount_print(s
+        * not matter as the mixed block count will still be correct
+        */
+       for (; pfn < end_pfn; ) {
+-              if (!pfn_valid(pfn)) {
++              page = pfn_to_online_page(pfn);
++              if (!page) {
+                       pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
+                       continue;
+               }
+@@ -266,13 +267,13 @@ void pagetypeinfo_showmixedcount_print(s
+               block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+               block_end_pfn = min(block_end_pfn, end_pfn);
+-              page = pfn_to_page(pfn);
+               pageblock_mt = get_pageblock_migratetype(page);
+               for (; pfn < block_end_pfn; pfn++) {
+                       if (!pfn_valid_within(pfn))
+                               continue;
++                      /* The pageblock is online, no need to recheck. */
+                       page = pfn_to_page(pfn);
+                       if (page_zone(page) != zone)
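
The outer loop now validates once per block and skips a whole MAX_ORDER
block on failure; the inner loop may then use pfn_to_page() directly
because the block is known to be online. A sketch of the skip arithmetic
(toy block size and online test):

  #include <stdbool.h>
  #include <stdio.h>

  #define MAX_ORDER_NR_PAGES 16UL
  #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

  static bool pfn_online(unsigned long pfn)
  {
      return (pfn / MAX_ORDER_NR_PAGES) % 2 == 0; /* toy: odd blocks offline */
  }

  int main(void)
  {
      unsigned long pfn = 0, end_pfn = 64;

      while (pfn < end_pfn) {
          if (!pfn_online(pfn)) {
              /* the whole block may be uninitialized: skip it */
              pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
              continue;
          }
          printf("scanning online block at pfn %lu\n", pfn);
          pfn += MAX_ORDER_NR_PAGES; /* per-pfn inner scan elided */
      }
      return 0;
  }
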
diff --git a/queue-5.3/mm-slub-fix-a-deadlock-in-show_slab_objects.patch b/queue-5.3/mm-slub-fix-a-deadlock-in-show_slab_objects.patch
new file mode 100644 (file)
index 0000000..9e72f14
--- /dev/null
@@ -0,0 +1,186 @@
+From e4f8e513c3d353c134ad4eef9fd0bba12406c7c8 Mon Sep 17 00:00:00 2001
+From: Qian Cai <cai@lca.pw>
+Date: Mon, 14 Oct 2019 14:11:51 -0700
+Subject: mm/slub: fix a deadlock in show_slab_objects()
+
+From: Qian Cai <cai@lca.pw>
+
+commit e4f8e513c3d353c134ad4eef9fd0bba12406c7c8 upstream.
+
+A long time ago we fixed a similar deadlock in show_slab_objects() [1].
+However, apparently due to commits like 01fb58bcba63 ("slab: remove
+synchronous synchronize_sched() from memcg cache deactivation path") and
+03afc0e25f7f ("slab: get_online_mems for
+kmem_cache_{create,destroy,shrink}"), this kind of deadlock is back;
+just reading files in /sys/kernel/slab will generate the lockdep splat
+below.
+
+Since the "mem_hotplug_lock" here is only taken to obtain a stable online
+node mask while racing with NUMA node hotplug, in the worst case the
+results may be miscalculated during a concurrent NUMA node hotplug, but
+they will be corrected by later reads of the same files.
+
+  WARNING: possible circular locking dependency detected
+  ------------------------------------------------------
+  cat/5224 is trying to acquire lock:
+  ffff900012ac3120 (mem_hotplug_lock.rw_sem){++++}, at:
+  show_slab_objects+0x94/0x3a8
+
+  but task is already holding lock:
+  b8ff009693eee398 (kn->count#45){++++}, at: kernfs_seq_start+0x44/0xf0
+
+  which lock already depends on the new lock.
+
+  the existing dependency chain (in reverse order) is:
+
+  -> #2 (kn->count#45){++++}:
+         lock_acquire+0x31c/0x360
+         __kernfs_remove+0x290/0x490
+         kernfs_remove+0x30/0x44
+         sysfs_remove_dir+0x70/0x88
+         kobject_del+0x50/0xb0
+         sysfs_slab_unlink+0x2c/0x38
+         shutdown_cache+0xa0/0xf0
+         kmemcg_cache_shutdown_fn+0x1c/0x34
+         kmemcg_workfn+0x44/0x64
+         process_one_work+0x4f4/0x950
+         worker_thread+0x390/0x4bc
+         kthread+0x1cc/0x1e8
+         ret_from_fork+0x10/0x18
+
+  -> #1 (slab_mutex){+.+.}:
+         lock_acquire+0x31c/0x360
+         __mutex_lock_common+0x16c/0xf78
+         mutex_lock_nested+0x40/0x50
+         memcg_create_kmem_cache+0x38/0x16c
+         memcg_kmem_cache_create_func+0x3c/0x70
+         process_one_work+0x4f4/0x950
+         worker_thread+0x390/0x4bc
+         kthread+0x1cc/0x1e8
+         ret_from_fork+0x10/0x18
+
+  -> #0 (mem_hotplug_lock.rw_sem){++++}:
+         validate_chain+0xd10/0x2bcc
+         __lock_acquire+0x7f4/0xb8c
+         lock_acquire+0x31c/0x360
+         get_online_mems+0x54/0x150
+         show_slab_objects+0x94/0x3a8
+         total_objects_show+0x28/0x34
+         slab_attr_show+0x38/0x54
+         sysfs_kf_seq_show+0x198/0x2d4
+         kernfs_seq_show+0xa4/0xcc
+         seq_read+0x30c/0x8a8
+         kernfs_fop_read+0xa8/0x314
+         __vfs_read+0x88/0x20c
+         vfs_read+0xd8/0x10c
+         ksys_read+0xb0/0x120
+         __arm64_sys_read+0x54/0x88
+         el0_svc_handler+0x170/0x240
+         el0_svc+0x8/0xc
+
+  other info that might help us debug this:
+
+  Chain exists of:
+    mem_hotplug_lock.rw_sem --> slab_mutex --> kn->count#45
+
+   Possible unsafe locking scenario:
+
+         CPU0                    CPU1
+         ----                    ----
+    lock(kn->count#45);
+                                 lock(slab_mutex);
+                                 lock(kn->count#45);
+    lock(mem_hotplug_lock.rw_sem);
+
+   *** DEADLOCK ***
+
+  3 locks held by cat/5224:
+   #0: 9eff00095b14b2a0 (&p->lock){+.+.}, at: seq_read+0x4c/0x8a8
+   #1: 0eff008997041480 (&of->mutex){+.+.}, at: kernfs_seq_start+0x34/0xf0
+   #2: b8ff009693eee398 (kn->count#45){++++}, at:
+  kernfs_seq_start+0x44/0xf0
+
+  stack backtrace:
+  Call trace:
+   dump_backtrace+0x0/0x248
+   show_stack+0x20/0x2c
+   dump_stack+0xd0/0x140
+   print_circular_bug+0x368/0x380
+   check_noncircular+0x248/0x250
+   validate_chain+0xd10/0x2bcc
+   __lock_acquire+0x7f4/0xb8c
+   lock_acquire+0x31c/0x360
+   get_online_mems+0x54/0x150
+   show_slab_objects+0x94/0x3a8
+   total_objects_show+0x28/0x34
+   slab_attr_show+0x38/0x54
+   sysfs_kf_seq_show+0x198/0x2d4
+   kernfs_seq_show+0xa4/0xcc
+   seq_read+0x30c/0x8a8
+   kernfs_fop_read+0xa8/0x314
+   __vfs_read+0x88/0x20c
+   vfs_read+0xd8/0x10c
+   ksys_read+0xb0/0x120
+   __arm64_sys_read+0x54/0x88
+   el0_svc_handler+0x170/0x240
+   el0_svc+0x8/0xc
+
+I think it is important to mention that this doesn't expose
+show_slab_objects() to a use-after-free.  There is only a single path
+that might really race here, the slab hotplug notifier callback
+__kmem_cache_shrink() (via slab_mem_going_offline_callback()), but that
+path doesn't destroy the kmem_cache_node data structures.
+
+[1] http://lkml.iu.edu/hypermail/linux/kernel/1101.0/02850.html
+
+[akpm@linux-foundation.org: add comment explaining why we don't need mem_hotplug_lock]
+Link: http://lkml.kernel.org/r/1570192309-10132-1-git-send-email-cai@lca.pw
+Fixes: 01fb58bcba63 ("slab: remove synchronous synchronize_sched() from memcg cache deactivation path")
+Fixes: 03afc0e25f7f ("slab: get_online_mems for kmem_cache_{create,destroy,shrink}")
+Signed-off-by: Qian Cai <cai@lca.pw>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: Roman Gushchin <guro@fb.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slub.c |   13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -4836,7 +4836,17 @@ static ssize_t show_slab_objects(struct
+               }
+       }
+-      get_online_mems();
++      /*
++       * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
++       * already held which will conflict with an existing lock order:
++       *
++       * mem_hotplug_lock->slab_mutex->kernfs_mutex
++       *
++       * We don't really need mem_hotplug_lock (to hold off
++       * slab_mem_going_offline_callback) here because slab's memory hot
++       * unplug code doesn't destroy the kmem_cache->node[] data.
++       */
++
+ #ifdef CONFIG_SLUB_DEBUG
+       if (flags & SO_ALL) {
+               struct kmem_cache_node *n;
+@@ -4877,7 +4887,6 @@ static ssize_t show_slab_objects(struct
+                       x += sprintf(buf + x, " N%d=%lu",
+                                       node, nodes[node]);
+ #endif
+-      put_online_mems();
+       kfree(nodes);
+       return x + sprintf(buf + x, "\n");
+ }
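
For context, the splat above is a classic ABBA lock-order inversion, and the fix breaks the cycle by simply not taking mem_hotplug_lock on the read side. A minimal userspace sketch of the same bug class, using plain pthreads rather than kernel locks (all names here are hypothetical, not kernel code):

  #include <pthread.h>
  #include <stdio.h>
  #include <unistd.h>

  /* Stand-ins for mem_hotplug_lock and kn->count from the splat. */
  static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
  static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

  /* Sysfs read path: takes kn->count (b), then mem_hotplug_lock (a). */
  static void *reader(void *arg)
  {
          pthread_mutex_lock(&lock_b);
          sleep(1);                      /* widen the race window */
          pthread_mutex_lock(&lock_a);   /* can deadlock here */
          pthread_mutex_unlock(&lock_a);
          pthread_mutex_unlock(&lock_b);
          return NULL;
  }

  /* Hotplug/shutdown path: takes a, then (via slab_mutex) b. */
  static void *hotplug(void *arg)
  {
          pthread_mutex_lock(&lock_a);
          sleep(1);
          pthread_mutex_lock(&lock_b);   /* can deadlock here */
          pthread_mutex_unlock(&lock_b);
          pthread_mutex_unlock(&lock_a);
          return NULL;
  }

  int main(void)
  {
          pthread_t t1, t2;

          pthread_create(&t1, NULL, reader, NULL);
          pthread_create(&t2, NULL, hotplug, NULL);
          pthread_join(t1, NULL);        /* hangs when the ABBA bites */
          pthread_join(t2, NULL);
          puts("no deadlock this run");
          return 0;
  }

The patch removes the mem_hotplug_lock acquisition from the read side entirely, deleting one edge of the cycle; the commit message above argues why that is safe.
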
diff --git a/queue-5.3/mmc-cqhci-commit-descriptors-before-setting-the-doorbell.patch b/queue-5.3/mmc-cqhci-commit-descriptors-before-setting-the-doorbell.patch
new file mode 100644 (file)
index 0000000..d50404c
--- /dev/null
@@ -0,0 +1,34 @@
+From c07d0073b9ec80a139d07ebf78e9c30d2a28279e Mon Sep 17 00:00:00 2001
+From: Faiz Abbas <faiz_abbas@ti.com>
+Date: Tue, 15 Oct 2019 00:08:49 +0530
+Subject: mmc: cqhci: Commit descriptors before setting the doorbell
+
+From: Faiz Abbas <faiz_abbas@ti.com>
+
+commit c07d0073b9ec80a139d07ebf78e9c30d2a28279e upstream.
+
+Add a write memory barrier to make sure that descriptors are actually
+written to memory, before ringing the doorbell.
+
+Signed-off-by: Faiz Abbas <faiz_abbas@ti.com>
+Acked-by: Adrian Hunter <adrian.hunter@intel.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mmc/host/cqhci.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/mmc/host/cqhci.c
++++ b/drivers/mmc/host/cqhci.c
+@@ -611,7 +611,8 @@ static int cqhci_request(struct mmc_host
+       cq_host->slot[tag].flags = 0;
+       cq_host->qcnt += 1;
+-
++      /* Make sure descriptors are ready before ringing the doorbell */
++      wmb();
+       cqhci_writel(cq_host, 1 << tag, CQHCI_TDBR);
+       if (!(cqhci_readl(cq_host, CQHCI_TDBR) & (1 << tag)))
+               pr_debug("%s: cqhci: doorbell not set for tag %d\n",
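
The pattern being enforced here is general to any descriptor ring with a doorbell register: write the descriptor, order the stores, then ring. A hedged userspace sketch using a C11 release fence in place of the kernel's wmb() (the descriptor layout and the mmio_write32() helper are made up for illustration):

  #include <stdatomic.h>
  #include <stdint.h>

  struct desc {                  /* hypothetical DMA descriptor */
          uint64_t addr;
          uint32_t len;
          uint32_t flags;
  };

  /* Hypothetical stand-in for a 32-bit MMIO register write. */
  static void mmio_write32(volatile uint32_t *reg, uint32_t val)
  {
          *reg = val;
  }

  static void submit(struct desc *ring, volatile uint32_t *doorbell,
                     unsigned int tag, uint64_t buf, uint32_t len)
  {
          ring[tag].addr  = buf;
          ring[tag].len   = len;
          ring[tag].flags = 1;   /* mark valid */

          /*
           * Order the descriptor stores before the doorbell store so
           * the device never sees the doorbell bit while descriptor
           * words are still in flight -- the role wmb() plays above.
           */
          atomic_thread_fence(memory_order_release);
          mmio_write32(doorbell, 1u << tag);
  }

On real hardware a compiler-level fence alone is not enough for device memory; the kernel's wmb() also orders the stores at the CPU/bus level, which is exactly what the one-line fix adds.
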
diff --git a/queue-5.3/mmc-mxs-fix-flags-passed-to-dmaengine_prep_slave_sg.patch b/queue-5.3/mmc-mxs-fix-flags-passed-to-dmaengine_prep_slave_sg.patch
new file mode 100644 (file)
index 0000000..6199a40
--- /dev/null
@@ -0,0 +1,64 @@
+From 2bb9f7566ba7ab3c2154964461e37b52cdc6b91b Mon Sep 17 00:00:00 2001
+From: Sascha Hauer <s.hauer@pengutronix.de>
+Date: Fri, 18 Oct 2019 11:39:34 +0200
+Subject: mmc: mxs: fix flags passed to dmaengine_prep_slave_sg
+
+From: Sascha Hauer <s.hauer@pengutronix.de>
+
+commit 2bb9f7566ba7ab3c2154964461e37b52cdc6b91b upstream.
+
+Since commit ceeeb99cd821 the DMA_CTRL_ACK flag is no longer abused for
+custom driver use; MXS_DMA_CTRL_WAIT4END was introduced instead. Not
+all users were converted to the new flag, though. This patch fixes it
+for the mxs-mmc driver.
+
+Fixes: ceeeb99cd821 ("dmaengine: mxs: rename custom flag")
+Signed-off-by: Sascha Hauer <s.hauer@pengutronix.de>
+Tested-by: Fabio Estevam <festevam@gmail.com>
+Reported-by: Bruno Thomsen <bruno.thomsen@gmail.com>
+Tested-by: Bruno Thomsen <bruno.thomsen@gmail.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mmc/host/mxs-mmc.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/drivers/mmc/host/mxs-mmc.c
++++ b/drivers/mmc/host/mxs-mmc.c
+@@ -17,6 +17,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/dma-mapping.h>
+ #include <linux/dmaengine.h>
++#include <linux/dma/mxs-dma.h>
+ #include <linux/highmem.h>
+ #include <linux/clk.h>
+ #include <linux/err.h>
+@@ -266,7 +267,7 @@ static void mxs_mmc_bc(struct mxs_mmc_ho
+       ssp->ssp_pio_words[2] = cmd1;
+       ssp->dma_dir = DMA_NONE;
+       ssp->slave_dirn = DMA_TRANS_NONE;
+-      desc = mxs_mmc_prep_dma(host, DMA_CTRL_ACK);
++      desc = mxs_mmc_prep_dma(host, MXS_DMA_CTRL_WAIT4END);
+       if (!desc)
+               goto out;
+@@ -311,7 +312,7 @@ static void mxs_mmc_ac(struct mxs_mmc_ho
+       ssp->ssp_pio_words[2] = cmd1;
+       ssp->dma_dir = DMA_NONE;
+       ssp->slave_dirn = DMA_TRANS_NONE;
+-      desc = mxs_mmc_prep_dma(host, DMA_CTRL_ACK);
++      desc = mxs_mmc_prep_dma(host, MXS_DMA_CTRL_WAIT4END);
+       if (!desc)
+               goto out;
+@@ -441,7 +442,7 @@ static void mxs_mmc_adtc(struct mxs_mmc_
+       host->data = data;
+       ssp->dma_dir = dma_data_dir;
+       ssp->slave_dirn = slave_dirn;
+-      desc = mxs_mmc_prep_dma(host, DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
++      desc = mxs_mmc_prep_dma(host, DMA_PREP_INTERRUPT | MXS_DMA_CTRL_WAIT4END);
+       if (!desc)
+               goto out;
diff --git a/queue-5.3/mmc-sdhci-omap-fix-tuning-procedure-for-temperatures-20c.patch b/queue-5.3/mmc-sdhci-omap-fix-tuning-procedure-for-temperatures-20c.patch
new file mode 100644 (file)
index 0000000..f35c0d3
--- /dev/null
@@ -0,0 +1,38 @@
+From feb40824d78eac5e48f56498dca941754dff33d7 Mon Sep 17 00:00:00 2001
+From: Faiz Abbas <faiz_abbas@ti.com>
+Date: Thu, 10 Oct 2019 16:22:30 +0530
+Subject: mmc: sdhci-omap: Fix Tuning procedure for temperatures < -20C
+
+From: Faiz Abbas <faiz_abbas@ti.com>
+
+commit feb40824d78eac5e48f56498dca941754dff33d7 upstream.
+
+According to the App note[1] detailing the tuning algorithm, for
+temperatures < -20C, the initial tuning value should be min(largest value
+in LPW - 24, ceil(13/16 ratio of LPW)). The largest value in LPW is
+(max_window + 4 * (max_len - 1)) and not (max_window + 4 * max_len) itself.
+Fix this implementation.
+
+[1] http://www.ti.com/lit/an/spraca9b/spraca9b.pdf
+
+Fixes: 961de0a856e3 ("mmc: sdhci-omap: Workaround errata regarding SDR104/HS200 tuning failures (i929)")
+Cc: stable@vger.kernel.org
+Signed-off-by: Faiz Abbas <faiz_abbas@ti.com>
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/mmc/host/sdhci-omap.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/mmc/host/sdhci-omap.c
++++ b/drivers/mmc/host/sdhci-omap.c
+@@ -372,7 +372,7 @@ static int sdhci_omap_execute_tuning(str
+        * on temperature
+        */
+       if (temperature < -20000)
+-              phase_delay = min(max_window + 4 * max_len - 24,
++              phase_delay = min(max_window + 4 * (max_len - 1) - 24,
+                                 max_window +
+                                 DIV_ROUND_UP(13 * max_len, 16) * 4);
+       else if (temperature < 20000)
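
A quick worked example shows the size of the error being fixed. With hypothetical tuning results max_window = 8 and max_len = 10 (each length unit is 4 phase-delay steps), the largest value in the passing window is max_window + 4 * (max_len - 1) = 44, so the capped term should be 20, not 24:

  #include <stdio.h>

  int main(void)
  {
          int max_window = 8, max_len = 10;   /* hypothetical values */

          int old_cap = max_window + 4 * max_len - 24;         /* 24 */
          int new_cap = max_window + 4 * (max_len - 1) - 24;   /* 20 */
          /* DIV_ROUND_UP(13 * max_len, 16) * 4, as in the driver */
          int ratio = max_window + ((13 * max_len + 15) / 16) * 4; /* 44 */

          printf("old: min(%d, %d) = %d\n", old_cap, ratio,
                 old_cap < ratio ? old_cap : ratio);
          printf("new: min(%d, %d) = %d\n", new_cap, ratio,
                 new_cap < ratio ? new_cap : ratio);
          return 0;
  }
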
diff --git a/queue-5.3/s390-kaslr-add-support-for-r_390_glob_dat-relocation-type.patch b/queue-5.3/s390-kaslr-add-support-for-r_390_glob_dat-relocation-type.patch
new file mode 100644 (file)
index 0000000..a628c25
--- /dev/null
@@ -0,0 +1,63 @@
+From ac49303d9ef0ad98b79867a380ef23480e48870b Mon Sep 17 00:00:00 2001
+From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
+Date: Mon, 21 Oct 2019 19:56:00 +0200
+Subject: s390/kaslr: add support for R_390_GLOB_DAT relocation type
+
+From: Gerald Schaefer <gerald.schaefer@de.ibm.com>
+
+commit ac49303d9ef0ad98b79867a380ef23480e48870b upstream.
+
+Commit "bpf: Process in-kernel BTF" in linux-next introduced an undefined
+__weak symbol, which results in an R_390_GLOB_DAT relocation type. That
+is not yet handled by the KASLR relocation code, and the kernel stops with
+the message "Unknown relocation type".
+
+Add code to detect and handle R_390_GLOB_DAT relocation types and undefined
+symbols.
+
+Fixes: 805bc0bc238f ("s390/kernel: build a relocatable kernel")
+Cc: <stable@vger.kernel.org> # v5.2+
+Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
+Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/boot/startup.c               |   14 +++++++++++---
+ arch/s390/kernel/machine_kexec_reloc.c |    1 +
+ 2 files changed, 12 insertions(+), 3 deletions(-)
+
+--- a/arch/s390/boot/startup.c
++++ b/arch/s390/boot/startup.c
+@@ -101,10 +101,18 @@ static void handle_relocs(unsigned long
+       dynsym = (Elf64_Sym *) vmlinux.dynsym_start;
+       for (rela = rela_start; rela < rela_end; rela++) {
+               loc = rela->r_offset + offset;
+-              val = rela->r_addend + offset;
++              val = rela->r_addend;
+               r_sym = ELF64_R_SYM(rela->r_info);
+-              if (r_sym)
+-                      val += dynsym[r_sym].st_value;
++              if (r_sym) {
++                      if (dynsym[r_sym].st_shndx != SHN_UNDEF)
++                              val += dynsym[r_sym].st_value + offset;
++              } else {
++                      /*
++                       * 0 == undefined symbol table index (STN_UNDEF),
++                       * used for R_390_RELATIVE, only add KASLR offset
++                       */
++                      val += offset;
++              }
+               r_type = ELF64_R_TYPE(rela->r_info);
+               rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0);
+               if (rc)
+--- a/arch/s390/kernel/machine_kexec_reloc.c
++++ b/arch/s390/kernel/machine_kexec_reloc.c
+@@ -27,6 +27,7 @@ int arch_kexec_do_relocs(int r_type, voi
+               *(u32 *)loc = val;
+               break;
+       case R_390_64:          /* Direct 64 bit.  */
++      case R_390_GLOB_DAT:
+               *(u64 *)loc = val;
+               break;
+       case R_390_PC16:        /* PC relative 16 bit.  */
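
In ELF terms, R_390_GLOB_DAT stores the 64-bit address of a symbol into a GOT slot, which is why it can share the R_390_64 store above. A simplified host-side sketch of the value computation the patched handle_relocs() performs (it mirrors the hunk above, but the helper itself is made up):

  #include <elf.h>
  #include <stdint.h>

  /*
   * Compute the value to store for one RELA entry, given the KASLR
   * offset.  An undefined __weak symbol keeps st_value == 0 and must
   * NOT have the offset added, or its "NULL" address would be
   * relocated to a bogus in-kernel address.
   */
  static uint64_t reloc_value(const Elf64_Rela *rela,
                              const Elf64_Sym *dynsym,
                              uint64_t offset)
  {
          uint64_t val = rela->r_addend;
          uint32_t r_sym = ELF64_R_SYM(rela->r_info);

          if (r_sym) {
                  if (dynsym[r_sym].st_shndx != SHN_UNDEF)
                          val += dynsym[r_sym].st_value + offset;
          } else {
                  /* STN_UNDEF: an R_390_RELATIVE entry, offset only */
                  val += offset;
          }
          return val;
  }
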
diff --git a/queue-5.3/s390-zcrypt-fix-memleak-at-release.patch b/queue-5.3/s390-zcrypt-fix-memleak-at-release.patch
new file mode 100644 (file)
index 0000000..5c4bc73
--- /dev/null
@@ -0,0 +1,37 @@
+From 388bb19be8eab4674a660e0c97eaf60775362bc7 Mon Sep 17 00:00:00 2001
+From: Johan Hovold <johan@kernel.org>
+Date: Thu, 10 Oct 2019 15:13:33 +0200
+Subject: s390/zcrypt: fix memleak at release
+
+From: Johan Hovold <johan@kernel.org>
+
+commit 388bb19be8eab4674a660e0c97eaf60775362bc7 upstream.
+
+If a process is interrupted while accessing the crypto device and the
+global ap_perms_mutex is contended, release() could return early and
+fail to free related resources.
+
+Fixes: 00fab2350e6b ("s390/zcrypt: multiple zcrypt device nodes support")
+Cc: <stable@vger.kernel.org> # 4.19
+Cc: Harald Freudenberger <freude@linux.ibm.com>
+Signed-off-by: Johan Hovold <johan@kernel.org>
+Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/s390/crypto/zcrypt_api.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/s390/crypto/zcrypt_api.c
++++ b/drivers/s390/crypto/zcrypt_api.c
+@@ -539,8 +539,7 @@ static int zcrypt_release(struct inode *
+       if (filp->f_inode->i_cdev == &zcrypt_cdev) {
+               struct zcdn_device *zcdndev;
+-              if (mutex_lock_interruptible(&ap_perms_mutex))
+-                      return -ERESTARTSYS;
++              mutex_lock(&ap_perms_mutex);
+               zcdndev = find_zcdndev_by_devt(filp->f_inode->i_rdev);
+               mutex_unlock(&ap_perms_mutex);
+               if (zcdndev) {
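
The underlying rule: a file_operations release() handler runs exactly once on final close and nothing will ever retry it, so returning -ERESTARTSYS there just leaks whatever the handler was about to free. A hedged userspace sketch of the corrected shape (the session state is hypothetical, not the zcrypt code):

  #include <pthread.h>
  #include <stdlib.h>

  static pthread_mutex_t perms_mutex = PTHREAD_MUTEX_INITIALIZER;

  struct session {               /* hypothetical per-open state */
          void *resources;
  };

  /*
   * Modeled on a .release handler: it runs exactly once on final
   * close, so it takes the lock unconditionally and frees everything.
   * Bailing out early on a signal, as mutex_lock_interruptible()
   * allowed above, would leak the session for good.
   */
  int session_release(struct session *s)
  {
          pthread_mutex_lock(&perms_mutex);
          /* ... drop bookkeeping that needs the lock ... */
          pthread_mutex_unlock(&perms_mutex);

          free(s->resources);
          free(s);
          return 0;
  }
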
index 7c3445dcdcc7333a8733d33f9cfd475465971e18..5e54dbfc0b62636dac7dc8eb782444a1fe1767f8 100644 (file)
@@ -138,3 +138,22 @@ fs-proc-page.c-don-t-access-uninitialized-memmaps-in-fs-proc-page.c.patch
 io_uring-fix-broken-links-with-offloading.patch
 io_uring-fix-race-for-sqes-with-userspace.patch
 io_uring-used-cached-copies-of-sq-dropped-and-cq-ove.patch
+mmc-mxs-fix-flags-passed-to-dmaengine_prep_slave_sg.patch
+mmc-cqhci-commit-descriptors-before-setting-the-doorbell.patch
+mmc-sdhci-omap-fix-tuning-procedure-for-temperatures-20c.patch
+mm-memory-failure.c-don-t-access-uninitialized-memmaps-in-memory_failure.patch
+mm-slub-fix-a-deadlock-in-show_slab_objects.patch
+mm-page_owner-don-t-access-uninitialized-memmaps-when-reading-proc-pagetypeinfo.patch
+mm-memory_hotplug-don-t-access-uninitialized-memmaps-in-shrink_pgdat_span.patch
+mm-memunmap-don-t-access-uninitialized-memmap-in-memunmap_pages.patch
+mm-memcg-slab-fix-panic-in-__free_slab-caused-by-premature-memcg-pointer-release.patch
+mm-compaction-fix-wrong-pfn-handling-in-__reset_isolation_pfn.patch
+mm-memcg-get-number-of-pages-on-the-lru-list-in-memcgroup-base-on-lru_zone_size.patch
+mm-memblock-do-not-enforce-current-limit-for-memblock_phys-family.patch
+hugetlbfs-don-t-access-uninitialized-memmaps-in-pfn_range_valid_gigantic.patch
+mm-memory-failure-poison-read-receives-sigkill-instead-of-sigbus-if-mmaped-more-than-once.patch
+zram-fix-race-between-backing_dev_show-and-backing_dev_store.patch
+xtensa-drop-export_symbol-for-outs-ins.patch
+xtensa-fix-change_bit-in-exclusive-access-option.patch
+s390-zcrypt-fix-memleak-at-release.patch
+s390-kaslr-add-support-for-r_390_glob_dat-relocation-type.patch
diff --git a/queue-5.3/xtensa-drop-export_symbol-for-outs-ins.patch b/queue-5.3/xtensa-drop-export_symbol-for-outs-ins.patch
new file mode 100644 (file)
index 0000000..5574181
--- /dev/null
@@ -0,0 +1,46 @@
+From 8b39da985194aac2998dd9e3a22d00b596cebf1e Mon Sep 17 00:00:00 2001
+From: Max Filippov <jcmvbkbc@gmail.com>
+Date: Mon, 14 Oct 2019 15:48:19 -0700
+Subject: xtensa: drop EXPORT_SYMBOL for outs*/ins*
+
+From: Max Filippov <jcmvbkbc@gmail.com>
+
+commit 8b39da985194aac2998dd9e3a22d00b596cebf1e upstream.
+
+Custom outs*/ins* implementations are long gone from the xtensa port;
+remove the matching EXPORT_SYMBOLs.
+
+This fixes the following build warnings issued by modpost since commit
+15bfc2348d54 ("modpost: check for static EXPORT_SYMBOL* functions"):
+
+  WARNING: "insb" [vmlinux] is a static EXPORT_SYMBOL
+  WARNING: "insw" [vmlinux] is a static EXPORT_SYMBOL
+  WARNING: "insl" [vmlinux] is a static EXPORT_SYMBOL
+  WARNING: "outsb" [vmlinux] is a static EXPORT_SYMBOL
+  WARNING: "outsw" [vmlinux] is a static EXPORT_SYMBOL
+  WARNING: "outsl" [vmlinux] is a static EXPORT_SYMBOL
+
+Cc: stable@vger.kernel.org
+Fixes: d38efc1f150f ("xtensa: adopt generic io routines")
+Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/xtensa/kernel/xtensa_ksyms.c |    7 -------
+ 1 file changed, 7 deletions(-)
+
+--- a/arch/xtensa/kernel/xtensa_ksyms.c
++++ b/arch/xtensa/kernel/xtensa_ksyms.c
+@@ -119,13 +119,6 @@ EXPORT_SYMBOL(__invalidate_icache_range)
+ // FIXME EXPORT_SYMBOL(screen_info);
+ #endif
+-EXPORT_SYMBOL(outsb);
+-EXPORT_SYMBOL(outsw);
+-EXPORT_SYMBOL(outsl);
+-EXPORT_SYMBOL(insb);
+-EXPORT_SYMBOL(insw);
+-EXPORT_SYMBOL(insl);
+-
+ extern long common_exception_return;
+ EXPORT_SYMBOL(common_exception_return);
diff --git a/queue-5.3/xtensa-fix-change_bit-in-exclusive-access-option.patch b/queue-5.3/xtensa-fix-change_bit-in-exclusive-access-option.patch
new file mode 100644 (file)
index 0000000..c7373bf
--- /dev/null
@@ -0,0 +1,32 @@
+From 775fd6bfefc66a8c33e91dd9687ed530643b954d Mon Sep 17 00:00:00 2001
+From: Max Filippov <jcmvbkbc@gmail.com>
+Date: Tue, 15 Oct 2019 21:51:43 -0700
+Subject: xtensa: fix change_bit in exclusive access option
+
+From: Max Filippov <jcmvbkbc@gmail.com>
+
+commit 775fd6bfefc66a8c33e91dd9687ed530643b954d upstream.
+
+The change_bit() implementation for the XCHAL_HAVE_EXCLUSIVE case
+changes all bits except the one requested, due to a copy-paste error
+from clear_bit().
+
+Cc: stable@vger.kernel.org # v5.2+
+Fixes: f7c34874f04a ("xtensa: add exclusive atomics support")
+Signed-off-by: Max Filippov <jcmvbkbc@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/xtensa/include/asm/bitops.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/xtensa/include/asm/bitops.h
++++ b/arch/xtensa/include/asm/bitops.h
+@@ -148,7 +148,7 @@ static inline void change_bit(unsigned i
+                       "       getex   %0\n"
+                       "       beqz    %0, 1b\n"
+                       : "=&a" (tmp)
+-                      : "a" (~mask), "a" (p)
++                      : "a" (mask), "a" (p)
+                       : "memory");
+ }
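
The one-character fix follows directly from the standard bit-op identities: set uses OR with mask, clear uses AND with ~mask, and change uses XOR with mask. A small self-contained C check of why XOR with ~mask is wrong:

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
          uint32_t word = 0xf0f0f0f0;
          uint32_t mask = 1u << 3;

          /* set_bit / clear_bit / change_bit semantics */
          assert((word | mask)  == 0xf0f0f0f8);   /* set:    OR  mask  */
          assert((word & ~mask) == 0xf0f0f0f0);   /* clear:  AND ~mask */
          assert((word ^ mask)  == 0xf0f0f0f8);   /* change: XOR mask  */

          /* the bug: XOR with ~mask flips every bit EXCEPT the target */
          assert((word ^ ~mask) == 0x0f0f0f07);
          return 0;
  }
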
diff --git a/queue-5.3/zram-fix-race-between-backing_dev_show-and-backing_dev_store.patch b/queue-5.3/zram-fix-race-between-backing_dev_show-and-backing_dev_store.patch
new file mode 100644 (file)
index 0000000..caa0388
--- /dev/null
@@ -0,0 +1,66 @@
+From f7daefe4231e57381d92c2e2ad905a899c28e402 Mon Sep 17 00:00:00 2001
+From: Chenwandun <chenwandun@huawei.com>
+Date: Fri, 18 Oct 2019 20:20:14 -0700
+Subject: zram: fix race between backing_dev_show and backing_dev_store
+
+From: Chenwandun <chenwandun@huawei.com>
+
+commit f7daefe4231e57381d92c2e2ad905a899c28e402 upstream.
+
+CPU0:                                 CPU1:
+backing_dev_show                      backing_dev_store
+    ......                                ......
+    file = zram->backing_dev;
+    down_read(&zram->init_lock);          down_read(&zram->init_lock);
+    file_path(file, ...);                 zram->backing_dev = backing_dev;
+    up_read(&zram->init_lock);            up_read(&zram->init_lock);
+
+backing_dev_show() samples the value of zram->backing_dev too early,
+before taking init_lock, so the value can be NULL when it is sampled
+yet non-NULL later, when it is checked under the lock.
+
+backtrace:
+  d_path+0xcc/0x174
+  file_path+0x10/0x18
+  backing_dev_show+0x40/0xb4
+  dev_attr_show+0x20/0x54
+  sysfs_kf_seq_show+0x9c/0x10c
+  kernfs_seq_show+0x28/0x30
+  seq_read+0x184/0x488
+  kernfs_fop_read+0x5c/0x1a4
+  __vfs_read+0x44/0x128
+  vfs_read+0xa0/0x138
+  SyS_read+0x54/0xb4
+
+Link: http://lkml.kernel.org/r/1571046839-16814-1-git-send-email-chenwandun@huawei.com
+Signed-off-by: Chenwandun <chenwandun@huawei.com>
+Acked-by: Minchan Kim <minchan@kernel.org>
+Cc: Sergey Senozhatsky <sergey.senozhatsky.work@gmail.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: <stable@vger.kernel.org>   [4.14+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/zram/zram_drv.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/drivers/block/zram/zram_drv.c
++++ b/drivers/block/zram/zram_drv.c
+@@ -413,13 +413,14 @@ static void reset_bdev(struct zram *zram
+ static ssize_t backing_dev_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+ {
++      struct file *file;
+       struct zram *zram = dev_to_zram(dev);
+-      struct file *file = zram->backing_dev;
+       char *p;
+       ssize_t ret;
+       down_read(&zram->init_lock);
+-      if (!zram->backing_dev) {
++      file = zram->backing_dev;
++      if (!file) {
+               memcpy(buf, "none\n", 5);
+               up_read(&zram->init_lock);
+               return 5;